git-annex/Command/Unused.hs

246 lines
7.4 KiB
Haskell
Raw Normal View History

{- git-annex command
-
2011-09-29 00:12:11 +00:00
- Copyright 2010-2011 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE BangPatterns #-}
module Command.Unused where
import qualified Data.Set as S
import qualified Data.ByteString.Lazy.Char8 as L
2011-10-05 20:02:51 +00:00
import Common.Annex
import Command
2011-10-04 04:40:47 +00:00
import Annex.Content
import Utility.FileMode
2011-10-16 04:31:25 +00:00
import Utility.TempFile
2011-10-15 20:21:08 +00:00
import Logs.Location
2010-11-15 22:04:19 +00:00
import qualified Annex
import qualified Git
import qualified Git.LsFiles as LsFiles
import qualified Git.LsTree as LsTree
import qualified Backend
2011-04-03 00:59:41 +00:00
import qualified Remote
2011-10-04 04:40:47 +00:00
import qualified Annex.Branch
import Annex.CatFile
command :: [Command]
command = [Command "unused" paramNothing (noTo >> needsRepo) seek
"look for unused file content"]
seek :: [CommandSeek]
seek = [withNothing start]
{- Finds unused content in the annex. -}
start :: CommandStart
start = notBareRepo $ do
from <- Annex.getState Annex.fromremote
let (name, action) = case from of
Nothing -> (".", checkUnused)
Just "." -> (".", checkUnused)
Just n -> (n, checkRemoteUnused n)
showStart "unused" name
next action
checkUnused :: CommandPerform
checkUnused = do
(unused, stalebad, staletmp) <- unusedKeys
2011-05-29 02:24:48 +00:00
_ <- list "" unusedMsg unused 0 >>=
list "bad" staleBadMsg stalebad >>=
list "tmp" staleTmpMsg staletmp
next $ return True
where
list file msg l c = do
let unusedlist = number c l
unless (null l) $ showLongNote $ msg unusedlist
writeUnusedFile file unusedlist
return $ c + length l
checkRemoteUnused :: String -> CommandPerform
checkRemoteUnused name = do
checkRemoteUnused' =<< Remote.byName name
next $ return True
2011-05-15 06:49:43 +00:00
checkRemoteUnused' :: Remote.Remote Annex -> Annex ()
checkRemoteUnused' r = do
showAction "checking for unused data"
remotehas <- filterM isthere =<< loggedKeys
remoteunused <- excludeReferenced remotehas
2011-04-03 00:59:41 +00:00
let list = number 0 remoteunused
writeUnusedFile "" list
unless (null remoteunused) $ showLongNote $ remoteUnusedMsg r list
2011-04-03 00:59:41 +00:00
where
{- This should run strictly to avoid the filterM
- building many thunks containing keyLocations data. -}
2011-04-03 00:59:41 +00:00
isthere k = do
2011-06-22 20:13:43 +00:00
us <- keyLocations k
let !there = uuid `elem` us
return there
2011-04-03 00:59:41 +00:00
uuid = Remote.uuid r
writeUnusedFile :: FilePath -> [(Int, Key)] -> Annex ()
writeUnusedFile prefix l = do
g <- gitRepo
2011-06-30 04:42:09 +00:00
liftIO $ viaTmp writeFile (gitAnnexUnusedLog prefix g) $
2011-04-03 00:59:41 +00:00
unlines $ map (\(n, k) -> show n ++ " " ++ show k) l
table :: [(Int, Key)] -> [String]
2011-07-15 16:47:14 +00:00
table l = " NUMBER KEY" : map cols l
where
cols (n,k) = " " ++ pad 6 (show n) ++ " " ++ show k
2010-11-15 22:04:19 +00:00
pad n s = s ++ replicate (n - length s) ' '
number :: Int -> [a] -> [(Int, a)]
2010-11-15 22:04:19 +00:00
number _ [] = []
2011-07-15 16:47:14 +00:00
number n (x:xs) = (n+1, x) : number (n+1) xs
2011-04-03 00:59:41 +00:00
staleTmpMsg :: [(Int, Key)] -> String
staleTmpMsg t = unlines $
["Some partially transferred data exists in temporary files:"]
++ table t ++ [dropMsg Nothing]
staleBadMsg :: [(Int, Key)] -> String
staleBadMsg t = unlines $
["Some corrupted files have been preserved by fsck, just in case:"]
++ table t ++ [dropMsg Nothing]
2011-04-03 00:59:41 +00:00
unusedMsg :: [(Int, Key)] -> String
unusedMsg u = unusedMsg' u
["Some annexed data is no longer used by any files:"]
[dropMsg Nothing]
2011-04-03 00:59:41 +00:00
unusedMsg' :: [(Int, Key)] -> [String] -> [String] -> String
unusedMsg' u header trailer = unlines $
header ++
table u ++
["(To see where data was previously used, try: git log --stat -S'KEY')"] ++
trailer
remoteUnusedMsg :: Remote.Remote Annex -> [(Int, Key)] -> String
remoteUnusedMsg r u = unusedMsg' u
["Some annexed data on " ++ name ++ " is not used by any files:"]
[dropMsg $ Just r]
where
name = Remote.name r
2011-04-03 00:59:41 +00:00
dropMsg :: Maybe (Remote.Remote Annex) -> String
dropMsg Nothing = dropMsg' ""
dropMsg (Just r) = dropMsg' $ " --from " ++ Remote.name r
dropMsg' :: String -> String
dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\n"
2011-04-03 00:59:41 +00:00
{- Finds keys whose content is present, but that do not seem to be used
- by any files in the git repo, or that are only present as bad or tmp
- files. -}
unusedKeys :: Annex ([Key], [Key], [Key])
unusedKeys = do
fast <- Annex.getState Annex.fast
if fast
then do
showNote "fast mode enabled; only finding stale files"
tmp <- staleKeys gitAnnexTmpDir
bad <- staleKeys gitAnnexBadDir
return ([], bad, tmp)
else do
showAction "checking for unused data"
present <- getKeysPresent
unused <- excludeReferenced present
staletmp <- staleKeysPrune gitAnnexTmpDir present
stalebad <- staleKeysPrune gitAnnexBadDir present
return (unused, stalebad, staletmp)
{- Finds keys in the list that are not referenced in the git repository. -}
excludeReferenced :: [Key] -> Annex [Key]
2011-09-28 21:38:41 +00:00
excludeReferenced [] = return [] -- optimisation
excludeReferenced l = do
g <- gitRepo
c <- liftIO $ Git.pipeRead g [Param "show-ref"]
2011-09-29 00:12:11 +00:00
removewith (getKeysReferenced : map getKeysReferencedInGit (refs c))
(S.fromList l)
where
-- Skip the git-annex branches, and get all other unique refs.
refs = map last .
nubBy cmpheads .
filter ourbranches .
map words . lines . L.unpack
cmpheads a b = head a == head b
2011-10-04 04:40:47 +00:00
ourbranchend = '/' : Annex.Branch.name
ourbranches ws = not $ ourbranchend `isSuffixOf` last ws
2011-09-29 00:12:11 +00:00
removewith [] s = return $ S.toList s
removewith (a:as) s
| s == S.empty = return [] -- optimisation
| otherwise = do
referenced <- a
let !s' = s `S.difference` S.fromList referenced
removewith as s'
2011-04-03 00:59:41 +00:00
{- Finds items in the first, smaller list, that are not
- present in the second, larger list.
-
- Constructing a single set, of the list that tends to be
- smaller, appears more efficient in both memory and CPU
- than constructing and taking the S.difference of two sets. -}
exclude :: Ord a => [a] -> [a] -> [a]
exclude [] _ = [] -- optimisation
exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
where
remove a b = foldl (flip S.delete) b a
{- List of keys referenced by symlinks in the git repo. -}
getKeysReferenced :: Annex [Key]
getKeysReferenced = do
g <- gitRepo
files <- liftIO $ LsFiles.inRepo g [Git.workTree g]
keypairs <- mapM Backend.lookupFile files
return $ map fst $ catMaybes keypairs
{- List of keys referenced by symlinks in a git ref. -}
getKeysReferencedInGit :: String -> Annex [Key]
getKeysReferencedInGit ref = do
showAction $ "checking " ++ Git.refDescribe ref
g <- gitRepo
findkeys [] =<< liftIO (LsTree.lsTree g ref)
where
findkeys c [] = return c
2011-09-29 00:12:11 +00:00
findkeys c (l:ls)
| isSymLink (LsTree.mode l) = do
content <- catFile ref $ LsTree.file l
case fileKey (takeFileName content) of
Nothing -> findkeys c ls
Just k -> findkeys (k:c) ls
| otherwise = findkeys c ls
{- Looks in the specified directory for bad/tmp keys, and returns a list
- of those that might still have value, or might be stale and removable.
-
- When a list of presently available keys is provided, stale keys
- that no longer have value are deleted.
-}
staleKeysPrune :: (Git.Repo -> FilePath) -> [Key] -> Annex [Key]
staleKeysPrune dirspec present = do
contents <- staleKeys dirspec
2011-09-23 22:13:24 +00:00
let stale = contents `exclude` present
let dups = contents `exclude` stale
g <- gitRepo
let dir = dirspec g
liftIO $ forM_ dups $ \t -> removeFile $ dir </> keyFile t
return stale
staleKeys :: (Git.Repo -> FilePath) -> Annex [Key]
staleKeys dirspec = do
g <- gitRepo
let dir = dirspec g
exists <- liftIO $ doesDirectoryExist dir
if not exists
then return []
else do
contents <- liftIO $ getDirectoryContents dir
files <- liftIO $ filterM doesFileExist $
map (dir </>) contents
2011-07-15 16:47:14 +00:00
return $ mapMaybe (fileKey . takeFileName) files