
{- git-annex command
 -
 - Copyright 2010-2012 Joey Hess <>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}
{-# LANGUAGE BangPatterns #-}
module Command.Unused where
import Control.Monad.ST
import qualified Data.Map as M
2011-10-05 16:02:51 -04:00
import Common.Annex
import Command
import Logs.Unused
2011-10-04 00:40:47 -04:00
import Annex.Content
2011-10-15 16:21:08 -04:00
import Logs.Location
2010-11-15 18:04:19 -04:00
import qualified Annex
import qualified Git
2011-12-14 15:56:11 -04:00
import qualified Git.Command
2011-12-12 18:23:24 -04:00
import qualified Git.Ref
import qualified Git.Branch
import qualified Git.LsFiles as LsFiles
import qualified Git.DiffTree as DiffTree
import qualified Backend
2011-04-02 20:59:41 -04:00
import qualified Remote
2011-10-04 00:40:47 -04:00
import qualified Annex.Branch
import Annex.CatFile
import Types.Key
import Types.RefSpec
import Git.FilePath
import Logs.View (is_branchView)
import Utility.Bloom
{- The commands provided by this module: a single "unused" command that
 - takes no parameters, is listed in the maintenance section, and accepts
 - the --from and --used-refspec options. -}
cmd :: [Command]
cmd = [withOptions options unusedcommand]
  where
    options = [unusedFromOption, refSpecOption]
    unusedcommand = command "unused" paramNothing seek
      SectionMaintenance "look for unused file content"
2014-01-26 16:25:55 -04:00
{- The -f/--from field option. Its value is read back in start, where
 - anything other than "." or "here" is treated as the name of a remote. -}
unusedFromOption :: Option
unusedFromOption = fieldOption shortnames "from" paramRemote description
  where
    shortnames = ['f']
    description = "remote to check for unused content"
{- The --used-refspec field option (no short name). Its value is parsed
 - with parseRefSpec in start. -}
refSpecOption :: Option
refSpecOption = fieldOption shortnames "used-refspec" paramRefSpec description
  where
    shortnames = []
    description = "refs to consider used (default: all refs)"
{- The command takes no parameters, so start is run via withNothing. -}
seek :: CommandSeek
seek params = withNothing start params
{- Finds unused content in the annex.
 -
 - The refspec describing which refs count as used is taken from the
 - --used-refspec field when it is set, otherwise from the
 - annexUsedRefSpec git config setting, otherwise allRefSpec.
 - A refspec that fails to parse is raised with error.
 -
 - With no --from value, or a value of "." or "here", the local annex
 - is checked; any other value is used as the name of a remote.
 -}
start :: CommandStart
start = do
  cfgrefspec <- fromMaybe allRefSpec . annexUsedRefSpec <$> Annex.getGitConfig
  -- The bang forces the refspec here, so a parse failure surfaces
  -- before any checking starts.
  !refspec <- maybe cfgrefspec parse <$> Annex.getField (optionName refSpecOption)
  from <- Annex.getField (optionName unusedFromOption)
  let local = (".", checkUnused refspec)
      (name, action) = case from of
        Nothing -> local
        Just "." -> local
        Just "here" -> local
        Just n -> (n, checkRemoteUnused n refspec)
  showStart "unused" name
  next action
  where
    parse = either error id . parseRefSpec
{- Checks the local repository for unused content.
 -
 - Three lists are produced in order: unused annexed keys, stale keys in
 - the bad directory, and stale keys in the tmp object directory. The
 - running count is threaded from one check to the next, so the numbers
 - shown continue across the three lists.
 -}
checkUnused :: RefSpec -> CommandPerform
checkUnused refspec = chain 0
  [ check "" unusedMsg $ findunused =<< Annex.getState Annex.fast
  , check "bad" staleBadMsg $ staleKeysPrune gitAnnexBadDir False
  , check "tmp" staleTmpMsg $ staleKeysPrune gitAnnexTmpObjectDir True
  ]
  where
    -- The argument is the fast mode flag; in fast mode the scan for
    -- unused keys is skipped entirely.
    findunused True = do
      showNote "fast mode enabled; only finding stale files"
      return []
    findunused False = do
      showAction "checking for unused data"
      -- InAnnex, not InRepository because if a direct mode
      -- file exists, it is obviously not unused.
      excludeReferenced refspec =<< getKeysPresent InAnnex

    -- Runs each check with the count left by the previous one.
    chain _ [] = next $ return True
    chain v (a:as) = do
      v' <- a v
      chain v' as
{- Checks a named remote for unused content.
 -
 - The keys that the location log records for the remote's UUID are
 - filtered with excludeReferenced, and whatever remains is listed
 - (numbered from 1) and written to the unused log.
 -
 - Raises an error if looking up the remote by name yields Nothing.
 -}
checkRemoteUnused :: String -> RefSpec -> CommandPerform
checkRemoteUnused name refspec =
  maybe unknownremote go =<< Remote.byNameWithUUID (Just name)
  where
    unknownremote = error $ "there is no remote named \"" ++ name ++ "\""
    go r = do
      showAction "checking for unused data"
      _ <- check "" (remoteUnusedMsg r) (remoteunused r) 0
      next $ return True
    remoteunused r = excludeReferenced refspec <=< loggedKeysFor $ Remote.uuid r
2012-03-11 21:08:48 -04:00
{- Runs an action that produces a list of keys, numbers them starting
 - after the count passed in, shows the message built from the numbered
 - list (only when the list is non-empty), and records the numbered list
 - with updateUnusedLog under the given file name.
 -
 - Returns the count advanced by the number of keys found.
 -}
check :: FilePath -> ([(Int, Key)] -> String) -> Annex [Key] -> Int -> Annex Int
check file msg a c = do
  keys <- a
  let numbered = number c keys
  unless (null keys) $
    showLongNote (msg numbered)
  -- The log is rewritten even when nothing was found.
  updateUnusedLog file (M.fromList numbered)
  return (c + length keys)
2011-05-15 02:49:43 -04:00
2012-03-11 21:08:48 -04:00
{- Pairs each list item with a consecutive number; the first item gets
 - one more than the number passed in. -}
number :: Int -> [a] -> [(Int, a)]
number n = zip [n + 1 ..]
2011-04-02 20:59:41 -04:00
{- Formats numbered keys as lines of a table, preceded by a header line.
 - Each number is padded on the right with spaces to at least 6 columns. -}
table :: [(Int, Key)] -> [String]
table l = " NUMBER KEY" : map cols l
  where
    cols (n, k) = " " ++ pad 6 (show n) ++ " " ++ key2file k
    -- replicate with a count <= 0 yields "", so longer strings are
    -- passed through unchanged.
    pad n s = s ++ replicate (n - length s) ' '
2010-11-15 18:04:19 -04:00
2011-04-02 20:59:41 -04:00
{- Message listing numbered keys found in temporary files, followed by
 - the hint for dropping them locally. -}
staleTmpMsg :: [(Int, Key)] -> String
staleTmpMsg t = unlines (header : table t ++ [dropMsg Nothing])
  where
    header = "Some partially transferred data exists in temporary files:"
{- Message listing numbered keys of corrupted files, followed by the hint
 - for dropping them locally. -}
staleBadMsg :: [(Int, Key)] -> String
staleBadMsg t = unlines (header : table t ++ [dropMsg Nothing])
  where
    header = "Some corrupted files have been preserved by fsck, just in case:"
2011-04-02 20:59:41 -04:00
{- Message listing numbered unused keys in the local annex, followed by
 - the hint for dropping them locally. -}
unusedMsg :: [(Int, Key)] -> String
unusedMsg u = unusedMsg' u header trailer
  where
    header = ["Some annexed data is no longer used by any files:"]
    trailer = [dropMsg Nothing]
2011-04-02 20:59:41 -04:00
{- Builds an unused-data message: the header lines, the table of numbered
 - keys, a hint about finding where a key was previously used, and
 - finally the trailer lines. -}
unusedMsg' :: [(Int, Key)] -> [String] -> [String] -> String
unusedMsg' u header trailer = unlines $ concat
  [ header
  , table u
  , ["(To see where data was previously used, try: git log --stat -S'KEY')"]
  , trailer
  ]
2011-12-31 04:11:39 -04:00
{- Message listing numbered unused keys on a remote, followed by the hint
 - for dropping them from that remote. -}
remoteUnusedMsg :: Remote -> [(Int, Key)] -> String
remoteUnusedMsg r u = unusedMsg' u
  ["Some annexed data on " ++ name ++ " is not used by any files:"]
  [dropMsg $ Just r]
  where
    -- The message needs the remote's name, not the Remote value itself.
    name = Remote.name r
2011-12-31 04:11:39 -04:00
{- Hint telling the user how to drop the listed data. When a remote is
 - given, the suggested command includes --from with the remote's name. -}
dropMsg :: Maybe Remote -> String
dropMsg = dropMsg' . maybe "" fromremote
  where
    fromremote r = " --from " ++ Remote.name r
{- Builds the dropunused hint; the string passed in is spliced between
 - the command name and the NUMBER placeholder. -}
dropMsg' :: String -> String
dropMsg' s = concat
  [ "\nTo remove unwanted data: git-annex dropunused"
  , s
  , " NUMBER\n"
  ]
2011-04-02 20:59:41 -04:00
2012-03-12 15:21:20 -04:00
{- Finds keys in the list that are not referenced in the git repository.
 -
 - Strategy:
 -
 - * Build a bloom filter of all keys referenced by symlinks. This
 -   is the fastest one to build and will filter out most keys.
 - * If keys remain, build a second bloom filter of keys referenced by
 -   branches matching the RefSpec.
 - * The list is streamed through these bloom filters lazily, so both will
 -   exist at the same time. This means that twice the memory is used,
 -   but they're relatively small, so the added complexity of using a
 -   mutable bloom filter does not seem worthwhile.
 - * Generating the second bloom filter can take quite a while, since
 -   it needs enumerating all keys in all git branches. But, the common
 -   case, if the second filter is needed, is for some keys to be globally
 -   unused, and in that case, no short-circuit is possible.
 -   Short-circuiting if the first filter filters all the keys handles the
 -   other common case.
 -}
excludeReferenced :: RefSpec -> [Key] -> Annex [Key]
excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel
  where
    -- With no keys left there is nothing to filter, so the (possibly
    -- expensive) bloom filter is not generated at all.
    runfilter _ [] = return [] -- optimisation
    runfilter a l = bloomFilter show l <$> genBloomFilter show a
    firstlevel = withKeysReferencedM
    secondlevel = withKeysReferencedInGit refspec
{- A bloom filter capable of holding half a million keys with a
 - false positive rate of 1 in 1000 uses around 8 mb of memory,
 - so will easily fit on even my lowest memory systems.
 -
 - The capacity comes from the annexBloomCapacity git config setting,
 - defaulting to 500000 when that is not set.
 -}
bloomCapacity :: Annex Int
bloomCapacity = do
  cfg <- Annex.getGitConfig
  return $ fromMaybe 500000 (annexBloomCapacity cfg)
{- The accuracy comes from the annexBloomAccuracy git config setting,
 - defaulting to 1000 when that is not set. bloomBitsHashes uses its
 - reciprocal as the rate passed to safeSuggestSizing. -}
bloomAccuracy :: Annex Int
bloomAccuracy = do
  cfg <- Annex.getGitConfig
  return $ fromMaybe 1000 (annexBloomAccuracy cfg)
{- Works out the pair of sizing values for the bloom filter from the
 - configured capacity and accuracy. genBloomFilter binds the pair as
 - (numbits, numhashes).
 -
 - If safeSuggestSizing rejects the configured values, a warning is shown
 - and a fixed fallback pair is used instead.
 -}
bloomBitsHashes :: Annex (Int, Int)
bloomBitsHashes = do
  capacity <- bloomCapacity
  accuracy <- bloomAccuracy
  either fallback return $
    safeSuggestSizing capacity (1 / fromIntegral accuracy)
  where
    fallback e = do
      warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
      -- precalculated value for 500000 (1/1000)
      return (8388608, 10)
2012-03-12 15:21:20 -04:00
{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
 - to populate it.
 -
 - The action is passed a callback that it can use to feed values into the
 - bloom filter.
 -
 - Once the action completes, the mutable filter is frozen
 - for later use.
 -}
genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
genBloomFilter convert populate = do
  (numbits, numhashes) <- bloomBitsHashes
  bloom <- liftST $ newMB (cheapHashes numhashes) numbits
  _ <- populate $ \v -> liftST $ insertMB bloom (convert v)
  -- The mutable filter is not touched again after this point.
  liftST $ unsafeFreezeMB bloom
  where
    -- Runs an ST action in the Annex monad, by way of IO.
    liftST = liftIO . stToIO
2012-03-12 15:21:20 -04:00
{- Keeps only the items of the list whose converted value is not found in
 - the bloom filter. The order of the list is preserved. -}
bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
bloomFilter convert l bloom = [ v | v <- l, convert v `notElemB` bloom ]
2012-03-12 15:21:20 -04:00
{- Given an initial value, folds it with each key referenced by
 - symlinks in the git repo. The folding function is pure and is not
 - given the file name. -}
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
withKeysReferenced initial a = withKeysReferenced' Nothing initial folda
  where
    folda k _ v = return $ a k v
2012-03-12 15:21:20 -04:00
{- Runs an action on each referenced key in the git repo. Only the key is
 - passed to the action; the file name is dropped. -}
withKeysReferencedM :: (Key -> Annex ()) -> Annex ()
withKeysReferencedM a = withKeysReferenced' Nothing () calla
  where
    calla k _ _ = a k
2012-03-12 15:21:20 -04:00
{- Folds an action over keys and files referenced in a particular
 - directory. The remaining arguments (initial value and folding action)
 - are those of withKeysReferenced'. -}
withKeysFilesReferencedIn :: FilePath -> v -> (Key -> FilePath -> v -> Annex v) -> Annex v
withKeysFilesReferencedIn dir = withKeysReferenced' (Just dir)
{- Folds an action over each file that Backend.lookupFile resolves to a
 - key, passing the key, the file and the accumulated value.
 -
 - With a directory, the files listed by LsFiles.inRepo for it are used.
 - Without one, the files listed by LsFiles.allFiles from the top of the
 - repository are used, except in a bare repository, where no files
 - are examined.
 -
 - The cleanup action returned along with the file list is run after
 - the fold finishes.
 -}
withKeysReferenced' :: Maybe FilePath -> v -> (Key -> FilePath -> v -> Annex v) -> Annex v
withKeysReferenced' mdir initial a = do
  (files, clean) <- getfiles
  r <- go initial files
  liftIO $ void clean
  return r
  where
    getfiles = case mdir of
      Nothing -> ifM isBareRepo
        ( return ([], return True)
        , do
            top <- fromRepo Git.repoPath
            inRepo $ LsFiles.allFiles [top]
        )
      Just dir -> inRepo $ LsFiles.inRepo [dir]

    go v [] = return v
    go v (f:fs) = do
      x <- Backend.lookupFile f
      case x of
        Nothing -> go v fs
        Just k -> do
          -- The bang forces each intermediate value as it is
          -- produced, rather than letting thunks pile up over
          -- the whole file list.
          !v' <- a k f v
          go v' fs
2012-03-12 15:21:20 -04:00
{- Runs an action on keys referenced by git refs.
 -
 - The refs are read from the output of git show-ref. Refs whose name
 - ends with the name of the git-annex branch, refs under refs/synced/,
 - and view branches are left out; refs with the same sha are collapsed
 - to one. The RefSpec is then applied to what remains.
 -}
withKeysReferencedInGit :: RefSpec -> (Key -> Annex ()) -> Annex ()
withKeysReferencedInGit refspec a = do
  current <- inRepo Git.Branch.currentUnsafe
  shaHead <- maybe (return Nothing) (inRepo . Git.Ref.sha) current
  usedrefs <- applyRefSpec refspec . relevantrefs (shaHead, current)
    <$> inRepo (Git.Command.pipeReadStrict [Param "show-ref"])
  forM_ usedrefs $
    withKeysReferencedInGitRef a
  where
    -- Each show-ref line is split at the first space into a
    -- (sha, ref name) pair.
    relevantrefs headRef = addHead headRef .
      filter ourbranches .
      map (separate (== ' ')) .
      lines
    nubRefs = map (Git.Ref . snd) . nubBy (\(x, _) (y, _) -> x == y)
    ourbranchend = '/' : Git.fromRef Annex.Branch.name
    ourbranches (_, b) = not (ourbranchend `isSuffixOf` b)
      && not ("refs/synced/" `isPrefixOf` b)
      && not (is_branchView (Git.Ref b))
    addHead headRef refs = case headRef of
      -- if HEAD diverges from all branches (except the branch it
      -- points to), run the actions on staged keys (and keys
      -- that are only present in the work tree if the repo is
      -- non bare)
      (Just (Git.Ref x), Just (Git.Ref b))
        | all (\(x',b') -> x /= x' || b == b') refs ->
          Git.Ref.headRef
            : nubRefs (filter ((/= x) . fst) refs)
      _ -> nubRefs refs
2012-03-12 15:21:20 -04:00
{- Runs an action on keys referenced in the given Git reference which
 - differ from those referenced in the index.
 -
 - In a bare repository the ref is diffed with DiffTree.diffIndex,
 - otherwise with DiffTree.diffWorkTree. When the ref is HEAD of a
 - non-bare repository, keys are looked up from the files in the working
 - tree; otherwise they are parsed from the file's content in the ref.
 -}
withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex ()
withKeysReferencedInGitRef a ref = do
  showAction $ "checking " ++ Git.Ref.describe ref
  bare <- isBareRepo
  (ts,clean) <- inRepo $ if bare
    then DiffTree.diffIndex ref
    else DiffTree.diffWorkTree ref
  let lookAtWorkingTree = not bare && ref == Git.Ref.headRef
  -- Diff entries that do not yield a key are skipped.
  forM_ ts $ tKey lookAtWorkingTree >=> maybe noop a
  liftIO $ void clean
  where
    tKey True = Backend.lookupFile . getTopFilePath . DiffTree.file
    tKey False = fileKey . takeFileName . decodeBS <$$>
      catFile ref . getTopFilePath . DiffTree.file
{- The three unused maps, as read by withUnusedMaps from the unused logs
 - named "", "bad" and "tmp" respectively. -}
data UnusedMaps = UnusedMaps
  { unusedMap :: UnusedMap
  , unusedBadMap :: UnusedMap
  , unusedTmpMap :: UnusedMap
  }
{- Reads the three unused maps, expands each parameter into a list of
 - numbers with unusedSpec, and seeks over the given action applied to
 - every resulting number.
 -
 - The parameters are expanded against the left-biased union of the three
 - maps, which only matters for the "all" specification.
 -}
withUnusedMaps :: (UnusedMaps -> Int -> CommandStart) -> CommandSeek
withUnusedMaps a params = do
  unused <- readUnusedMap ""
  unusedbad <- readUnusedMap "bad"
  unusedtmp <- readUnusedMap "tmp"
  let combined = M.unions [unused, unusedbad, unusedtmp]
      maps = UnusedMaps unused unusedbad unusedtmp
      numbers = concatMap (unusedSpec combined) params
  seekActions $ return $ map (a maps) numbers
{- Expands a specification of unused items into a list of numbers.
 -
 - "all" gives every number from the smallest to the largest key of the
 - map (nothing when the map is empty). A specification containing "-"
 - is split at the first "-" and read as an inclusive range. Anything
 - else is read as a single number.
 -
 - Raises an error when a number cannot be read.
 -}
unusedSpec :: UnusedMap -> String -> [Int]
unusedSpec m spec
  | spec == "all" = if M.null m
    then []
    else [fst (M.findMin m)..fst (M.findMax m)]
  | "-" `isInfixOf` spec = range $ separate (== '-') spec
  | otherwise = maybe badspec (: []) (readish spec)
  where
    range (a, b) = case (readish a, readish b) of
      (Just x, Just y) -> [x..y]
      _ -> badspec
    badspec = error $ "Expected number or range, not \"" ++ spec ++ "\""
{- Seek action for unused content. Finds the number in the maps, and
 - calls one of 3 actions, depending on the type of unused file.
 -
 - The maps are searched in the order unused, bad, tmp, and the first
 - one containing the number wins. Raises an error when the number is in
 - none of them.
 -}
startUnused :: String
  -> (Key -> CommandPerform)
  -> (Key -> CommandPerform)
  -> (Key -> CommandPerform)
  -> UnusedMaps -> Int -> CommandStart
startUnused message unused badunused tmpunused maps n = search
  [ (unusedMap maps, unused)
  , (unusedBadMap maps, badunused)
  , (unusedTmpMap maps, tmpunused)
  ]
  where
    search [] = error $ show n ++ " not valid (run git annex unused for list)"
    search ((m, a):rest) =
      case M.lookup n m of
        Nothing -> search rest
        Just key -> do
          showStart message (show n)
          next $ a key