Improved temp file handling

* Improved temp file handling. Transfers of content can now be resumed
  from temp files later; the resume does not have to be the immediate
  next git-annex run.
* unused: Include partially transferred content in the list.
This commit is contained in:
Joey Hess 2011-01-28 14:10:50 -04:00
parent 04fe906ac6
commit e6da7eb177
8 changed files with 87 additions and 40 deletions

View file

@ -105,13 +105,3 @@ shutdown errnum = do
unless (q == GitQueue.empty) $ do unless (q == GitQueue.empty) $ do
showSideAction "Recording state in git..." showSideAction "Recording state in git..."
Annex.queueRun Annex.queueRun
-- If nothing failed, clean up any files left in the temp directory,
-- but leave the directory itself. If something failed, temp files
-- are left behind to allow resuming on re-run.
when (errnum == 0) $ do
g <- Annex.gitRepo
let tmp = gitAnnexTmpDir g
exists <- liftIO $ doesDirectoryExist tmp
when exists $ liftIO $ removeDirectoryRecursive tmp
liftIO $ createDirectoryIfMissing True tmp

View file

@ -7,6 +7,7 @@
module Command.DropUnused where module Command.DropUnused where
import Control.Monad (when)
import Control.Monad.State (liftIO) import Control.Monad.State (liftIO)
import qualified Data.Map as M import qualified Data.Map as M
import System.Directory import System.Directory
@ -33,8 +34,14 @@ start s = do
case M.lookup s m of case M.lookup s m of
Nothing -> return Nothing Nothing -> return Nothing
Just key -> do Just key -> do
g <- Annex.gitRepo
showStart "dropunused" s showStart "dropunused" s
backend <- keyBackend key backend <- keyBackend key
-- drop both content in the backend and any tmp
-- file for the key
let tmp = gitAnnexTmpLocation g key
tmp_exists <- liftIO $ doesFileExist tmp
when tmp_exists $ liftIO $ removeFile tmp
return $ Just $ Command.Drop.perform key backend (Just 0) return $ Just $ Command.Drop.perform key backend (Just 0)
readUnusedLog :: Annex (M.Map String Key) readUnusedLog :: Annex (M.Map String Key)

View file

@ -7,9 +7,12 @@
module Command.Unused where module Command.Unused where
import Control.Monad (filterM, unless)
import Control.Monad.State (liftIO) import Control.Monad.State (liftIO)
import qualified Data.Map as M import qualified Data.Map as M
import Data.Maybe import Data.Maybe
import System.FilePath
import System.Directory
import Command import Command
import Types import Types
@ -41,49 +44,71 @@ perform = do
checkUnused :: Annex Bool checkUnused :: Annex Bool
checkUnused = do checkUnused = do
showNote "checking for unused data..." showNote "checking for unused data..."
unused <- unusedKeys (unused, staletmp) <- unusedKeys
let list = number 1 unused let unusedlist = number 0 unused
let staletmplist = number (length unused) staletmp
let list = unusedlist ++ staletmplist
g <- Annex.gitRepo g <- Annex.gitRepo
liftIO $ safeWriteFile (gitAnnexUnusedLog g) $ unlines $ liftIO $ safeWriteFile (gitAnnexUnusedLog g) $ unlines $
map (\(n, k) -> show n ++ " " ++ show k) list map (\(n, k) -> show n ++ " " ++ show k) list
if null unused unless (null unused) $
then return True showLongNote $ unusedmsg unusedlist
else do unless (null staletmp) $
showLongNote $ w list showLongNote $ staletmpmsg staletmplist
return False unless (null list) $
showLongNote $ "\n"
return $ null list
where where
w u = unlines $ unusedmsg u = unlines $
["Some annexed data is no longer pointed to by any files in the repository:", ["Some annexed data is no longer pointed to by any files in the repository:"]
" NUMBER KEY"] ++ table u ++
++ map cols u ++ ["(To see where data was previously used, try: git log --stat -S'KEY')"] ++
["(To see where data was previously used, try: git log --stat -S'KEY')", dropmsg
"(To remove unwanted data: git-annex dropunused NUMBER)", staletmpmsg t = unlines $
""] ["Some partially transferred data exists in temporary files:"]
++ table t ++ dropmsg
dropmsg = ["(To remove unwanted data: git-annex dropunused NUMBER)"]
table l = [" NUMBER KEY"] ++ map cols l
cols (n,k) = " " ++ pad 6 (show n) ++ " " ++ show k cols (n,k) = " " ++ pad 6 (show n) ++ " " ++ show k
pad n s = s ++ replicate (n - length s) ' ' pad n s = s ++ replicate (n - length s) ' '
number :: Integer -> [a] -> [(Integer, a)] number :: Int -> [a] -> [(Int, a)]
number _ [] = [] number _ [] = []
number n (x:xs) = (n, x):(number (n+1) xs) number n (x:xs) = (n+1, x):(number (n+1) xs)
{- Finds keys whose content is present, but that do not seem to be used {- Finds keys whose content is present, but that do not seem to be used
- by any files in the git repo. -} - by any files in the git repo, or that are only present as tmp files. -}
unusedKeys :: Annex [Key] unusedKeys :: Annex ([Key], [Key])
unusedKeys = do unusedKeys = do
g <- Annex.gitRepo
present <- getKeysPresent present <- getKeysPresent
referenced <- getKeysReferenced referenced <- getKeysReferenced
-- Constructing a single map, of the set that tends to be smaller, let unused = present `exclude` referenced
-- appears more efficient in both memory and CPU than constructing
-- and taking the M.difference of two maps.
let present_m = existsMap present
let unused_m = remove referenced present_m
return $ M.keys unused_m
where
remove a b = foldl (flip M.delete) b a
existsMap :: Ord k => [k] -> M.Map k Int -- Some tmp files may be dups copies of content that is fully present.
existsMap l = M.fromList $ map (\k -> (k, 1)) l -- Simply delete those, while including the keys for the rest of
-- the temp files in the returned list for the user to deal with.
tmps <- tmpKeys
let staletmp = tmps `exclude` present
let duptmp = tmps `exclude` staletmp
_ <- liftIO $ mapM (\t -> removeFile $ gitAnnexTmpLocation g t) duptmp
return (unused, staletmp)
where
-- Constructing a single map, of the set that tends to be
-- smaller, appears more efficient in both memory and CPU
-- than constructing and taking the M.difference of two maps.
exclude [] _ = [] -- optimisation
exclude smaller larger = M.keys $ remove larger $ existsMap smaller
existsMap :: Ord k => [k] -> M.Map k Int
existsMap l = M.fromList $ map (\k -> (k, 1)) l
remove a b = foldl (flip M.delete) b a
{- List of keys referenced by symlinks in the git repo. -} {- List of keys referenced by symlinks in the git repo. -}
getKeysReferenced :: Annex [Key] getKeysReferenced :: Annex [Key]
@ -92,3 +117,17 @@ getKeysReferenced = do
files <- liftIO $ Git.inRepo g [Git.workTree g] files <- liftIO $ Git.inRepo g [Git.workTree g]
keypairs <- mapM Backend.lookupFile files keypairs <- mapM Backend.lookupFile files
return $ map fst $ catMaybes keypairs return $ map fst $ catMaybes keypairs
{- List of keys that have temp files in the git repo. -}
tmpKeys :: Annex [Key]
tmpKeys = do
g <- Annex.gitRepo
let tmp = gitAnnexTmpDir g
exists <- liftIO $ doesDirectoryExist tmp
if (not exists)
then return []
else do
contents <- liftIO $ getDirectoryContents tmp
files <- liftIO $ filterM doesFileExist $
map (tmp </>) contents
return $ map (fileKey . takeFileName) files

View file

@ -67,7 +67,7 @@ logStatus key status = do
getViaTmp :: Key -> (FilePath -> Annex Bool) -> Annex Bool getViaTmp :: Key -> (FilePath -> Annex Bool) -> Annex Bool
getViaTmp key action = do getViaTmp key action = do
g <- Annex.gitRepo g <- Annex.gitRepo
let tmp = gitAnnexTmpDir g </> keyFile key let tmp = gitAnnexTmpLocation g key
liftIO $ createDirectoryIfMissing True (parentDir tmp) liftIO $ createDirectoryIfMissing True (parentDir tmp)
success <- action tmp success <- action tmp
if success if success

View file

@ -15,6 +15,7 @@ module Locations (
gitAnnexDir, gitAnnexDir,
gitAnnexObjectDir, gitAnnexObjectDir,
gitAnnexTmpDir, gitAnnexTmpDir,
gitAnnexTmpLocation,
gitAnnexBadDir, gitAnnexBadDir,
gitAnnexUnusedLog, gitAnnexUnusedLog,
isLinkToAnnex, isLinkToAnnex,
@ -83,6 +84,10 @@ gitAnnexObjectDir r = addTrailingPathSeparator $ Git.workTree r </> objectDir
gitAnnexTmpDir :: Git.Repo -> FilePath gitAnnexTmpDir :: Git.Repo -> FilePath
gitAnnexTmpDir r = addTrailingPathSeparator $ gitAnnexDir r </> "tmp" gitAnnexTmpDir r = addTrailingPathSeparator $ gitAnnexDir r </> "tmp"
{- The temp file to use for a given key. -}
gitAnnexTmpLocation :: Git.Repo -> Key -> FilePath
gitAnnexTmpLocation r key = gitAnnexTmpDir r </> keyFile key
{- .git-annex/bad/ is used for bad files found during fsck -} {- .git-annex/bad/ is used for bad files found during fsck -}
gitAnnexBadDir :: Git.Repo -> FilePath gitAnnexBadDir :: Git.Repo -> FilePath
gitAnnexBadDir r = addTrailingPathSeparator $ gitAnnexDir r </> "bad" gitAnnexBadDir r = addTrailingPathSeparator $ gitAnnexDir r </> "bad"

4
debian/changelog vendored
View file

@ -8,6 +8,10 @@ git-annex (0.19) UNRELEASED; urgency=low
* fsck, drop: Take untrusted repositories into account. * fsck, drop: Take untrusted repositories into account.
* Bugfix: Files were copied from trusted remotes first even if their * Bugfix: Files were copied from trusted remotes first even if their
annex.cost was higher than other remotes. annex.cost was higher than other remotes.
* Improved temp file handling. Transfers of content can now be resumed
from temp files later; the resume does not have to be the immediate
next git-annex run.
* unused: Include partially transferred content in the list.
-- Joey Hess <joeyh@debian.org> Wed, 19 Jan 2011 18:07:51 -0400 -- Joey Hess <joeyh@debian.org> Wed, 19 Jan 2011 18:07:51 -0400

View file

@ -9,3 +9,5 @@ This presents 2 problems:
finished. finished.
--[[Joey]] --[[Joey]]
[[done]]

View file

@ -132,7 +132,7 @@ Many git-annex commands will stage changes for later `git commit` by you.
* unused * unused
Checks the annex for data that is not used by any files currently Checks the annex for data that is not used by any files currently
in the annex, and prints a numbered list of the data. in the annex, and prints a numbered list of the data.
* dropunused [number ...] * dropunused [number ...]