Improved temp file handling

* Improved temp file handling. Transfers of content can now be resumed
  from temp files later; the resume does not have to be the immediate
  next git-annex run.
* unused: Include partially transferred content in the list.
This commit is contained in:
Joey Hess 2011-01-28 14:10:50 -04:00
parent 04fe906ac6
commit e6da7eb177
8 changed files with 87 additions and 40 deletions

View file

@ -105,13 +105,3 @@ shutdown errnum = do
unless (q == GitQueue.empty) $ do
showSideAction "Recording state in git..."
Annex.queueRun
-- If nothing failed, clean up any files left in the temp directory,
-- but leave the directory itself. If something failed, temp files
-- are left behind to allow resuming on re-run.
when (errnum == 0) $ do
g <- Annex.gitRepo
let tmp = gitAnnexTmpDir g
exists <- liftIO $ doesDirectoryExist tmp
when exists $ liftIO $ removeDirectoryRecursive tmp
liftIO $ createDirectoryIfMissing True tmp

View file

@ -7,6 +7,7 @@
module Command.DropUnused where
import Control.Monad (when)
import Control.Monad.State (liftIO)
import qualified Data.Map as M
import System.Directory
@ -33,8 +34,14 @@ start s = do
case M.lookup s m of
Nothing -> return Nothing
Just key -> do
g <- Annex.gitRepo
showStart "dropunused" s
backend <- keyBackend key
-- drop both content in the backend and any tmp
-- file for the key
let tmp = gitAnnexTmpLocation g key
tmp_exists <- liftIO $ doesFileExist tmp
when tmp_exists $ liftIO $ removeFile tmp
return $ Just $ Command.Drop.perform key backend (Just 0)
readUnusedLog :: Annex (M.Map String Key)

View file

@ -7,9 +7,12 @@
module Command.Unused where
import Control.Monad (filterM, unless)
import Control.Monad.State (liftIO)
import qualified Data.Map as M
import Data.Maybe
import System.FilePath
import System.Directory
import Command
import Types
@ -41,49 +44,71 @@ perform = do
checkUnused :: Annex Bool
checkUnused = do
showNote "checking for unused data..."
unused <- unusedKeys
let list = number 1 unused
(unused, staletmp) <- unusedKeys
let unusedlist = number 0 unused
let staletmplist = number (length unused) staletmp
let list = unusedlist ++ staletmplist
g <- Annex.gitRepo
liftIO $ safeWriteFile (gitAnnexUnusedLog g) $ unlines $
map (\(n, k) -> show n ++ " " ++ show k) list
if null unused
then return True
else do
showLongNote $ w list
return False
unless (null unused) $
showLongNote $ unusedmsg unusedlist
unless (null staletmp) $
showLongNote $ staletmpmsg staletmplist
unless (null list) $
showLongNote $ "\n"
return $ null list
where
w u = unlines $
["Some annexed data is no longer pointed to by any files in the repository:",
" NUMBER KEY"]
++ map cols u ++
["(To see where data was previously used, try: git log --stat -S'KEY')",
"(To remove unwanted data: git-annex dropunused NUMBER)",
""]
unusedmsg u = unlines $
["Some annexed data is no longer pointed to by any files in the repository:"]
++ table u ++
["(To see where data was previously used, try: git log --stat -S'KEY')"] ++
dropmsg
staletmpmsg t = unlines $
["Some partially transferred data exists in temporary files:"]
++ table t ++ dropmsg
dropmsg = ["(To remove unwanted data: git-annex dropunused NUMBER)"]
table l = [" NUMBER KEY"] ++ map cols l
cols (n,k) = " " ++ pad 6 (show n) ++ " " ++ show k
pad n s = s ++ replicate (n - length s) ' '
number :: Integer -> [a] -> [(Integer, a)]
number :: Int -> [a] -> [(Int, a)]
number _ [] = []
number n (x:xs) = (n, x):(number (n+1) xs)
number n (x:xs) = (n+1, x):(number (n+1) xs)
{- Finds keys whose content is present, but that do not seem to be used
- by any files in the git repo. -}
unusedKeys :: Annex [Key]
- by any files in the git repo, or that are only present as tmp files. -}
unusedKeys :: Annex ([Key], [Key])
unusedKeys = do
g <- Annex.gitRepo
present <- getKeysPresent
referenced <- getKeysReferenced
-- Constructing a single map, of the set that tends to be smaller,
-- appears more efficient in both memory and CPU than constructing
-- and taking the M.difference of two maps.
let present_m = existsMap present
let unused_m = remove referenced present_m
return $ M.keys unused_m
where
remove a b = foldl (flip M.delete) b a
let unused = present `exclude` referenced
existsMap :: Ord k => [k] -> M.Map k Int
existsMap l = M.fromList $ map (\k -> (k, 1)) l
-- Some tmp files may be dups copies of content that is fully present.
-- Simply delete those, while including the keys for the rest of
-- the temp files in the returned list for the user to deal with.
tmps <- tmpKeys
let staletmp = tmps `exclude` present
let duptmp = tmps `exclude` staletmp
_ <- liftIO $ mapM (\t -> removeFile $ gitAnnexTmpLocation g t) duptmp
return (unused, staletmp)
where
-- Constructing a single map, of the set that tends to be
-- smaller, appears more efficient in both memory and CPU
-- than constructing and taking the M.difference of two maps.
exclude [] _ = [] -- optimisation
exclude smaller larger = M.keys $ remove larger $ existsMap smaller
existsMap :: Ord k => [k] -> M.Map k Int
existsMap l = M.fromList $ map (\k -> (k, 1)) l
remove a b = foldl (flip M.delete) b a
{- List of keys referenced by symlinks in the git repo. -}
getKeysReferenced :: Annex [Key]
@ -92,3 +117,17 @@ getKeysReferenced = do
files <- liftIO $ Git.inRepo g [Git.workTree g]
keypairs <- mapM Backend.lookupFile files
return $ map fst $ catMaybes keypairs
{- List of keys that have temp files in the git repo. -}
tmpKeys :: Annex [Key]
tmpKeys = do
g <- Annex.gitRepo
let tmp = gitAnnexTmpDir g
exists <- liftIO $ doesDirectoryExist tmp
if (not exists)
then return []
else do
contents <- liftIO $ getDirectoryContents tmp
files <- liftIO $ filterM doesFileExist $
map (tmp </>) contents
return $ map (fileKey . takeFileName) files

View file

@ -67,7 +67,7 @@ logStatus key status = do
getViaTmp :: Key -> (FilePath -> Annex Bool) -> Annex Bool
getViaTmp key action = do
g <- Annex.gitRepo
let tmp = gitAnnexTmpDir g </> keyFile key
let tmp = gitAnnexTmpLocation g key
liftIO $ createDirectoryIfMissing True (parentDir tmp)
success <- action tmp
if success

View file

@ -15,6 +15,7 @@ module Locations (
gitAnnexDir,
gitAnnexObjectDir,
gitAnnexTmpDir,
gitAnnexTmpLocation,
gitAnnexBadDir,
gitAnnexUnusedLog,
isLinkToAnnex,
@ -83,6 +84,10 @@ gitAnnexObjectDir r = addTrailingPathSeparator $ Git.workTree r </> objectDir
gitAnnexTmpDir :: Git.Repo -> FilePath
gitAnnexTmpDir r = addTrailingPathSeparator $ gitAnnexDir r </> "tmp"
{- The temp file to use for a given key. -}
gitAnnexTmpLocation :: Git.Repo -> Key -> FilePath
gitAnnexTmpLocation r key = gitAnnexTmpDir r </> keyFile key
{- .git-annex/bad/ is used for bad files found during fsck -}
gitAnnexBadDir :: Git.Repo -> FilePath
gitAnnexBadDir r = addTrailingPathSeparator $ gitAnnexDir r </> "bad"

4
debian/changelog vendored
View file

@ -8,6 +8,10 @@ git-annex (0.19) UNRELEASED; urgency=low
* fsck, drop: Take untrusted repositories into account.
* Bugfix: Files were copied from trusted remotes first even if their
annex.cost was higher than other remotes.
* Improved temp file handling. Transfers of content can now be resumed
from temp files later; the resume does not have to be the immediate
next git-annex run.
* unused: Include partially transferred content in the list.
-- Joey Hess <joeyh@debian.org> Wed, 19 Jan 2011 18:07:51 -0400

View file

@ -9,3 +9,5 @@ This presents 2 problems:
finished.
--[[Joey]]
[[done]]

View file

@ -132,7 +132,7 @@ Many git-annex commands will stage changes for later `git commit` by you.
* unused
Checks the annex for data that is not used by any files currently
in the annex, and prints a numbered list of the data.
in the annex, and prints a numbered list of the data.
* dropunused [number ...]