git-annex/Command/Unused.hs

341 lines
11 KiB
Haskell
Raw Normal View History

{- git-annex command
-
- Copyright 2010-2016 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE BangPatterns #-}
module Command.Unused where
import qualified Data.Map as M
import Command
import Logs.Unused
2011-10-04 04:40:47 +00:00
import Annex.Content
2011-10-15 20:21:08 +00:00
import Logs.Location
2010-11-15 22:04:19 +00:00
import qualified Annex
import qualified Git
2011-12-14 19:56:11 +00:00
import qualified Git.Command
2011-12-12 22:23:24 +00:00
import qualified Git.Ref
import qualified Git.Branch
import qualified Git.RefLog
import qualified Git.LsFiles as LsFiles
import qualified Git.DiffTree as DiffTree
2011-04-03 00:59:41 +00:00
import qualified Remote
2011-10-04 04:40:47 +00:00
import qualified Annex.Branch
import Annex.CatFile
import Annex.WorkTree
import Types.RefSpec
import Git.Types
import Git.Sha
import Git.FilePath
2016-06-02 20:59:15 +00:00
import Config
import Logs.View (is_branchView)
import Annex.BloomFilter
import qualified Database.Keys
import Annex.InodeSentinal
cmd :: Command
2016-01-07 00:41:25 +00:00
cmd = command "unused" SectionMaintenance "look for unused file content"
paramNothing (seek <$$> optParser)
data UnusedOptions = UnusedOptions
{ fromRemote :: Maybe RemoteName
, refSpecOption :: Maybe RefSpec
}
optParser :: CmdParamsDesc -> Parser UnusedOptions
optParser _ = UnusedOptions
<$> optional (strOption
( long "from" <> short 'f' <> metavar paramRemote
<> help "remote to check for unused content"
))
<*> optional (option (eitherReader parseRefSpec)
( long "used-refspec" <> metavar paramRefSpec
<> help "refs to consider used (default: all branches)"
))
seek :: UnusedOptions -> CommandSeek
seek = commandAction . start
start :: UnusedOptions -> CommandStart
start o = do
2015-05-14 19:44:08 +00:00
cfgrefspec <- fromMaybe allRefSpec . annexUsedRefSpec
<$> Annex.getGitConfig
let refspec = fromMaybe cfgrefspec (refSpecOption o)
let (name, perform) = case fromRemote o of
Nothing -> (".", checkUnused refspec)
Just "." -> (".", checkUnused refspec)
Just "here" -> (".", checkUnused refspec)
Just n -> (n, checkRemoteUnused n refspec)
showStart' "unused" (Just name)
next perform
checkUnused :: RefSpec -> CommandPerform
checkUnused refspec = chain 0
2012-03-12 01:08:48 +00:00
[ check "" unusedMsg $ findunused =<< Annex.getState Annex.fast
, check "bad" staleBadMsg $ staleKeysPrune gitAnnexBadDir False
, check "tmp" staleTmpMsg $ staleKeysPrune gitAnnexTmpObjectDir True
2012-03-12 01:08:48 +00:00
]
2012-11-12 05:05:04 +00:00
where
findunused True = do
showNote "fast mode enabled; only finding stale files"
return []
findunused False = do
showAction "checking for unused data"
-- InAnnex, not InRepository because if a direct mode
-- file exists, it is obviously not unused.
excludeReferenced refspec =<< getKeysPresent InAnnex
2012-11-12 05:05:04 +00:00
chain _ [] = next $ return True
chain v (a:as) = do
v' <- a v
chain v' as
checkRemoteUnused :: RemoteName -> RefSpec -> CommandPerform
checkRemoteUnused remotename refspec = go =<< Remote.nameToUUID remotename
2012-11-12 05:05:04 +00:00
where
go u = do
2012-11-12 05:05:04 +00:00
showAction "checking for unused data"
r <- Remote.byUUID u
_ <- check "" (remoteUnusedMsg r remotename) (remoteunused u) 0
2012-11-12 05:05:04 +00:00
next $ return True
remoteunused u = excludeReferenced refspec =<< loggedKeysFor u
2012-03-12 01:08:48 +00:00
check :: FilePath -> ([(Int, Key)] -> String) -> Annex [Key] -> Int -> Annex Int
check file msg a c = do
l <- a
let unusedlist = number c l
unless (null l) $ showLongNote $ msg unusedlist
updateUnusedLog file $ M.fromList unusedlist
2012-03-12 01:08:48 +00:00
return $ c + length l
2011-05-15 06:49:43 +00:00
2012-03-12 01:08:48 +00:00
number :: Int -> [a] -> [(Int, a)]
number _ [] = []
number n (x:xs) = (n+1, x) : number (n+1) xs
2011-04-03 00:59:41 +00:00
table :: [(Int, Key)] -> [String]
2011-07-15 16:47:14 +00:00
table l = " NUMBER KEY" : map cols l
2012-11-12 05:05:04 +00:00
where
cols (n,k) = " " ++ pad 6 (show n) ++ " " ++ serializeKey k
2012-11-12 05:05:04 +00:00
pad n s = s ++ replicate (n - length s) ' '
2010-11-15 22:04:19 +00:00
2011-04-03 00:59:41 +00:00
staleTmpMsg :: [(Int, Key)] -> String
staleTmpMsg t = unlines $
["Some partially transferred data exists in temporary files:"]
++ table t ++ [dropMsg Nothing]
staleBadMsg :: [(Int, Key)] -> String
staleBadMsg t = unlines $
["Some corrupted files have been preserved by fsck, just in case:"]
++ table t ++ [dropMsg Nothing]
2011-04-03 00:59:41 +00:00
unusedMsg :: [(Int, Key)] -> String
unusedMsg u = unusedMsg' u
["Some annexed data is no longer used by any files:"]
[dropMsg Nothing]
2011-04-03 00:59:41 +00:00
unusedMsg' :: [(Int, Key)] -> [String] -> [String] -> String
unusedMsg' u mheader mtrailer = unlines $
mheader ++
2011-04-03 00:59:41 +00:00
table u ++
["(To see where data was previously used, try: git log --stat --no-textconv -S'KEY')"] ++
mtrailer
2011-04-03 00:59:41 +00:00
remoteUnusedMsg :: Maybe Remote -> RemoteName -> [(Int, Key)] -> String
remoteUnusedMsg mr remotename u = unusedMsg' u
["Some annexed data on " ++ remotename ++ " is not used by any files:"]
(if isJust mr then [dropMsg (Just remotename)] else [])
dropMsg :: Maybe RemoteName -> String
2011-04-03 00:59:41 +00:00
dropMsg Nothing = dropMsg' ""
dropMsg (Just remotename) = dropMsg' $ " --from " ++ remotename
2011-04-03 00:59:41 +00:00
dropMsg' :: String -> String
dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\n"
2011-04-03 00:59:41 +00:00
2012-03-12 19:21:20 +00:00
{- Finds keys in the list that are not referenced in the git repository.
-
- Strategy:
-
- Pass keys through these filters in order, only creating each bloom
- filter on demand if the previous one didn't filter out all keys.
-
- 1. Bloom filter containing all keys referenced by files in the work tree.
- This is the fastest one to build and will filter out most keys.
- 2. Bloom filter containing all keys in the diff from the work tree to
- the index.
- 3. Associated files filter. A v6 unlocked file may have had its content
- added to the annex (by eg, git diff running the smudge filter),
- but the new key is not yet staged in the index. But if so, it will
- have an associated file.
- 4. Bloom filter containing all keys in the diffs between the index and
- branches matching the RefSpec. (This can take quite a while to build).
2012-03-12 19:21:20 +00:00
-}
excludeReferenced :: RefSpec -> [Key] -> Annex [Key]
excludeReferenced refspec ks = runbloomfilter withKeysReferencedM ks
>>= runbloomfilter withKeysReferencedDiffIndex
>>= runfilter associatedFilesFilter
>>= runbloomfilter (withKeysReferencedDiffGitRefs refspec)
2012-11-12 05:05:04 +00:00
where
runfilter _ [] = return [] -- optimisation
runfilter a l = a l
runbloomfilter a = runfilter $ \l -> bloomFilter l <$> genBloomFilter a
2012-03-12 19:21:20 +00:00
{- Given an initial value, folds it with each key referenced by
- files in the working tree. -}
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
withKeysReferenced initial a = withKeysReferenced' Nothing initial folda
2012-11-12 05:05:04 +00:00
where
folda k _ v = return $ a k v
2012-03-12 19:21:20 +00:00
{- Runs an action on each referenced key in the working tree. -}
2012-03-12 19:21:20 +00:00
withKeysReferencedM :: (Key -> Annex ()) -> Annex ()
withKeysReferencedM a = withKeysReferenced' Nothing () calla
2012-11-12 05:05:04 +00:00
where
calla k _ _ = a k
2012-03-12 19:21:20 +00:00
{- Folds an action over keys and files referenced in a particular directory. -}
withKeysFilesReferencedIn :: FilePath -> v -> (Key -> FilePath -> v -> Annex v) -> Annex v
withKeysFilesReferencedIn = withKeysReferenced' . Just
withKeysReferenced' :: Maybe FilePath -> v -> (Key -> FilePath -> v -> Annex v) -> Annex v
withKeysReferenced' mdir initial a = do
(files, clean) <- getfiles
r <- go initial files
liftIO $ void clean
return r
2012-11-12 05:05:04 +00:00
where
getfiles = case mdir of
Nothing -> ifM isBareRepo
( return ([], return True)
, do
top <- fromRepo Git.repoPath
inRepo $ LsFiles.allFiles [top]
)
Just dir -> inRepo $ LsFiles.inRepo [dir]
2012-11-12 05:05:04 +00:00
go v [] = return v
go v (f:fs) = do
mk <- lookupFile f
case mk of
2012-11-12 05:05:04 +00:00
Nothing -> go v fs
Just k -> do
!v' <- a k f v
2012-11-12 05:05:04 +00:00
go v' fs
2012-03-12 19:21:20 +00:00
withKeysReferencedDiffGitRefs :: RefSpec -> (Key -> Annex ()) -> Annex ()
withKeysReferencedDiffGitRefs refspec a = do
rs <- relevantrefs <$> inRepo (Git.Command.pipeReadStrict [Param "show-ref"])
shaHead <- maybe (return Nothing) (inRepo . Git.Ref.sha)
=<< inRepo Git.Branch.currentUnsafe
let haveHead = any (\(shaRef, _) -> Just shaRef == shaHead) rs
let rs' = map snd (nubRefs rs)
usedrefs <- applyRefSpec refspec rs' (getreflog rs')
forM_ (if haveHead then usedrefs else Git.Ref.headRef : usedrefs) $
withKeysReferencedDiffGitRef a
2012-11-12 05:05:04 +00:00
where
relevantrefs = map (\(r, h) -> (Git.Ref r, Git.Ref h)) .
2012-11-12 05:05:04 +00:00
filter ourbranches .
map (separate (== ' ')) .
lines
nubRefs = nubBy (\(x, _) (y, _) -> x == y)
ourbranchend = '/' : Git.fromRef Annex.Branch.name
2012-11-12 05:05:04 +00:00
ourbranches (_, b) = not (ourbranchend `isSuffixOf` b)
&& not ("refs/synced/" `isPrefixOf` b)
&& not (is_branchView (Git.Ref b))
getreflog rs = inRepo $ Git.RefLog.getMulti rs
2012-03-12 19:21:20 +00:00
{- Runs an action on keys referenced in the given Git reference which
- differ from those referenced in the index. -}
withKeysReferencedDiffGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex ()
withKeysReferencedDiffGitRef a ref = do
2011-12-12 22:23:24 +00:00
showAction $ "checking " ++ Git.Ref.describe ref
withKeysReferencedDiff a
(inRepo $ DiffTree.diffIndex ref)
DiffTree.srcsha
{- Runs an action on keys referenced in the index which differ from the
- work tree. -}
withKeysReferencedDiffIndex :: (Key -> Annex ()) -> Annex ()
withKeysReferencedDiffIndex a = unlessM (isBareRepo) $
withKeysReferencedDiff a
(inRepo $ DiffTree.diffFiles [])
DiffTree.srcsha
withKeysReferencedDiff :: (Key -> Annex ()) -> (Annex ([DiffTree.DiffTreeItem], IO Bool)) -> (DiffTree.DiffTreeItem -> Sha) -> Annex ()
withKeysReferencedDiff a getdiff extractsha = do
(ds, clean) <- getdiff
forM_ ds go
liftIO $ void clean
where
go d = do
let sha = extractsha d
unless (sha == nullSha) $
catKey sha >>= maybe noop a
{- Filters out keys that have an associated file that's not modified. -}
associatedFilesFilter :: [Key] -> Annex [Key]
associatedFilesFilter = filterM go
where
go k = do
cs <- Database.Keys.getInodeCaches k
if null cs
then return True
else checkunmodified cs
=<< Database.Keys.getAssociatedFiles k
checkunmodified _ [] = return True
checkunmodified cs (f:fs) = do
relf <- fromRepo $ fromTopFilePath f
ifM (sameInodeCache relf cs)
( return False
, checkunmodified cs fs
)
data UnusedMaps = UnusedMaps
{ unusedMap :: UnusedMap
, unusedBadMap :: UnusedMap
, unusedTmpMap :: UnusedMap
}
withUnusedMaps :: (UnusedMaps -> Int -> CommandStart) -> CmdParams -> CommandSeek
withUnusedMaps a params = do
unused <- readUnusedMap ""
unusedbad <- readUnusedMap "bad"
unusedtmp <- readUnusedMap "tmp"
let m = unused `M.union` unusedbad `M.union` unusedtmp
let unusedmaps = UnusedMaps unused unusedbad unusedtmp
commandActions $ map (a unusedmaps) $ concatMap (unusedSpec m) params
unusedSpec :: UnusedMap -> String -> [Int]
unusedSpec m spec
| spec == "all" = if M.null m
then []
else [fst (M.findMin m)..fst (M.findMax m)]
| "-" `isInfixOf` spec = range $ separate (== '-') spec
| otherwise = maybe badspec (: []) (readish spec)
where
range (a, b) = case (readish a, readish b) of
(Just x, Just y) -> [x..y]
_ -> badspec
badspec = giveup $ "Expected number or range, not \"" ++ spec ++ "\""
{- Seek action for unused content. Finds the number in the maps, and
- calls one of 3 actions, depending on the type of unused file. -}
startUnused :: String
-> (Key -> CommandPerform)
-> (Key -> CommandPerform)
-> (Key -> CommandPerform)
-> UnusedMaps -> Int -> CommandStart
startUnused message unused badunused tmpunused maps n = search
[ (unusedMap maps, unused)
, (unusedBadMap maps, badunused)
, (unusedTmpMap maps, tmpunused)
]
where
search [] = giveup $ show n ++ " not valid (run git annex unused for list)"
search ((m, a):rest) =
case M.lookup n m of
Nothing -> search rest
Just key -> do
showStart' message (Just $ show n)
next $ a key