unused/dropunused: support --from

This commit is contained in:
Joey Hess 2011-04-02 20:59:41 -04:00
parent 09a16176de
commit 868300d4c1
6 changed files with 147 additions and 44 deletions

View file

@ -19,6 +19,8 @@ import Messages
import Locations
import qualified Annex
import qualified Command.Drop
import qualified Command.Move
import qualified Remote
import Backend
import Key
@ -40,15 +42,28 @@ start m s = notBareRepo $ do
case M.lookup s m of
Nothing -> return Nothing
Just key -> do
g <- Annex.gitRepo
showStart "dropunused" s
backend <- keyBackend key
-- drop both content in the backend and any tmp
-- file for the key
from <- Annex.getState Annex.fromremote
case from of
Just name -> do
r <- Remote.byName name
return $ Just $ performRemote r key
_ -> return $ Just $ perform key
{- drop both content in the backend and any tmp file for the key -}
perform :: Key -> CommandPerform
perform key = do
g <- Annex.gitRepo
let tmp = gitAnnexTmpLocation g key
tmp_exists <- liftIO $ doesFileExist tmp
when tmp_exists $ liftIO $ removeFile tmp
return $ Just $ Command.Drop.perform key backend (Just 0)
backend <- keyBackend key
Command.Drop.perform key backend (Just 0) -- force drop
performRemote :: Remote.Remote Annex -> Key -> CommandPerform
performRemote r key = do
showNote $ "from " ++ Remote.name r ++ "..."
return $ Just $ Command.Move.fromCleanup r True key
readUnusedLog :: Annex (M.Map String Key)
readUnusedLog = do

View file

@ -20,9 +20,11 @@ import Content
import Messages
import Locations
import Utility
import LocationLog
import qualified Annex
import qualified GitRepo as Git
import qualified Backend
import qualified Remote
command :: [Command]
command = [repoCommand "unused" paramNothing seek
@ -39,35 +41,54 @@ start = notBareRepo $ do
perform :: CommandPerform
perform = do
_ <- checkUnused
from <- Annex.getState Annex.fromremote
case from of
Just name -> do
r <- Remote.byName name
checkRemoteUnused r
_ -> checkUnused
return $ Just $ return True
checkUnused :: Annex Bool
checkUnused :: Annex ()
checkUnused = do
(unused, staletmp) <- unusedKeys
let unusedlist = number 0 unused
let staletmplist = number (length unused) staletmp
let list = unusedlist ++ staletmplist
g <- Annex.gitRepo
liftIO $ safeWriteFile (gitAnnexUnusedLog g) $ unlines $
map (\(n, k) -> show n ++ " " ++ show k) list
unless (null unused) $ showLongNote $ unusedmsg unusedlist
unless (null staletmp) $ showLongNote $ staletmpmsg staletmplist
writeUnusedFile list
unless (null unused) $ showLongNote $ unusedMsg unusedlist
unless (null staletmp) $ showLongNote $ staleTmpMsg staletmplist
unless (null list) $ showLongNote $ "\n"
return $ null list
checkRemoteUnused :: Remote.Remote Annex -> Annex ()
checkRemoteUnused r = do
g <- Annex.gitRepo
showNote $ "checking for unused data on " ++ Remote.name r ++ "..."
referenced <- getKeysReferenced
logged <- liftIO $ loggedKeys g
remotehas <- filterM isthere logged
let remoteunused = remotehas `exclude` referenced
let list = number 0 remoteunused
writeUnusedFile list
unless (null remoteunused) $ do
showLongNote $ remoteUnusedMsg r list
showLongNote $ "\n"
where
unusedmsg u = unlines $
["Some annexed data is no longer pointed to by any files in the repository:"]
++ table u ++
["(To see where data was previously used, try: git log --stat -S'KEY')"] ++
dropmsg
staletmpmsg t = unlines $
["Some partially transferred data exists in temporary files:"]
++ table t ++ dropmsg
dropmsg = ["(To remove unwanted data: git-annex dropunused NUMBER)"]
isthere k = do
g <- Annex.gitRepo
us <- liftIO $ keyLocations g k
return $ uuid `elem` us
uuid = Remote.uuid r
writeUnusedFile :: [(Int, Key)] -> Annex ()
writeUnusedFile l = do
g <- Annex.gitRepo
liftIO $ safeWriteFile (gitAnnexUnusedLog g) $
unlines $ map (\(n, k) -> show n ++ " " ++ show k) l
table :: [(Int, Key)] -> [String]
table l = [" NUMBER KEY"] ++ map cols l
where
cols (n,k) = " " ++ pad 6 (show n) ++ " " ++ show k
pad n s = s ++ replicate (n - length s) ' '
@ -75,6 +96,39 @@ number :: Int -> [a] -> [(Int, a)]
number _ [] = []
number n (x:xs) = (n+1, x):(number (n+1) xs)
staleTmpMsg :: [(Int, Key)] -> String
staleTmpMsg t = unlines $
["Some partially transferred data exists in temporary files:"]
++ table t ++ [dropMsg Nothing]
unusedMsg :: [(Int, Key)] -> String
unusedMsg u = unusedMsg' u
["Some annexed data is no longer used by any files in the repository:"]
[dropMsg Nothing]
remoteUnusedMsg :: Remote.Remote Annex -> [(Int, Key)] -> String
remoteUnusedMsg r u = unusedMsg' u
["Some annexed data on " ++ name ++
" is not used by any files in this repository."]
[dropMsg $ Just r,
"Please be cautious -- are you sure that the remote repository",
"does not use this data?"]
where
name = Remote.name r
unusedMsg' :: [(Int, Key)] -> [String] -> [String] -> String
unusedMsg' u header trailer = unlines $
header ++
table u ++
["(To see where data was previously used, try: git log --stat -S'KEY')"] ++
trailer
dropMsg :: Maybe (Remote.Remote Annex) -> String
dropMsg Nothing = dropMsg' ""
dropMsg (Just r) = dropMsg' $ " --from " ++ Remote.name r
dropMsg' :: String -> String
dropMsg' s = "(To remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER)"
{- Finds keys whose content is present, but that do not seem to be used
- by any files in the git repo, or that are only present as tmp files. -}
unusedKeys :: Annex ([Key], [Key])
@ -93,7 +147,9 @@ unusedKeys = do
referenced <- getKeysReferenced
tmps <- tmpKeys
let (unused, staletmp, duptmp) = calcUnusedKeys present referenced tmps
let unused = present `exclude` referenced
let staletmp = tmps `exclude` present
let duptmp = tmps `exclude` staletmp
-- Tmp files that are dups of content already present
-- can simply be removed.
@ -102,18 +158,16 @@ unusedKeys = do
return (unused, staletmp)
calcUnusedKeys :: [Key] -> [Key] -> [Key] -> ([Key], [Key], [Key])
calcUnusedKeys present referenced tmps = (unused, staletmp, duptmp)
where
unused = present `exclude` referenced
staletmp = tmps `exclude` present
duptmp = tmps `exclude` staletmp
-- Constructing a single set, of the list that tends to be
-- smaller, appears more efficient in both memory and CPU
-- than constructing and taking the S.difference of two sets.
{- Finds items in the first, smaller list, that are not
- present in the second, larger list.
-
- Constructing a single set, of the list that tends to be
- smaller, appears more efficient in both memory and CPU
- than constructing and taking the S.difference of two sets. -}
exclude :: Ord a => [a] -> [a] -> [a]
exclude [] _ = [] -- optimisation
exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
where
remove a b = foldl (flip S.delete) b a
{- List of keys referenced by symlinks in the git repo. -}

3
debian/changelog vendored
View file

@ -4,6 +4,9 @@ git-annex (0.20110402) UNRELEASED; urgency=low
some issues with git on OSX with the mixed-case directories.
No migration is needed; the old mixed case hash directories are still
read; new information is written to the new directories.
* Unused files on remotes, particulary special remotes, can now be
identified and dropped, by using "--from remote" with git annex unused
and git annex dropunused.
-- Joey Hess <joeyh@debian.org> Sat, 02 Apr 2011 13:45:54 -0400

View file

@ -155,16 +155,21 @@ Many git-annex commands will stage changes for later `git commit` by you.
* unused
Checks the annex for data that is not used by any files currently
in the annex, and prints a numbered list of the data.
Checks the annex for data that does not correspond to any files currently
in the respository, and prints a numbered list of the data.
To only show unused temp files, specify --fast
To check data on a remote that does not correspond to any files currently
in the local repository, specify --from.
* dropunused [number ...]
Drops the data corresponding to the numbers, as listed by the last
`git annex unused`
To drop the data from a remote, specify --from.
* find [path ...]
Outputs a list of annexed files whose content is currently present.
@ -317,12 +322,15 @@ Many git-annex commands will stage changes for later `git commit` by you.
* --from=repository
Specifies a repository that content will be retrieved from.
Specifies a repository that content will be retrieved from, or that
should otherwise be acted on.
It should be specified using the name of a configured remote.
* --to=repository
Specifies a repository that content will be sent to.
It should be specified using the name of a configured remote.
* --exclude=glob

View file

@ -8,3 +8,26 @@ They cannot be used by other git commands though.
* [[Amazon_S3]]
* [[directory]]
## Unused content on special remotes
Over time, special remotes can accumulate file content that is no longer
referred to by files in git. Normally, unused content in the current
repository is found by running `git annex unused`. To detect unused content
on special remotes, instead use `git annex unused --from`. Example:
$ git annex unused --from mys3
unused (checking for unused data on mys3...)
Some annexed data on mys3 is not used by any files in this repository.
NUMBER KEY
1 WORM-s3-m1301674316--foo
(To see where data was previously used, try: git log --stat -S'KEY')
(To remove unwanted data: git-annex dropunused --from mys3 NUMBER)
Please be cautious -- are you sure that the remote repository
does not use this data?
$ git annex dropunused --from mys3 1
dropunused 12948 (from mys3...) ok
Do be cautious when using this; it cannot detect if content in a remote
is used by that remote, or is the last copy of data that is used by
some *other* remote.

View file

@ -10,7 +10,7 @@ eliminate it to save space.
# git annex unused
unused (checking for unused data...)
Some annexed data is no longer pointed to by any files in the repository.
Some annexed data is no longer used by any files in the repository.
NUMBER KEY
1 WORM-s3-m1289672605--file
2 WORM-s14-m1289672605--file