status: Fixed to run in nearly constant space.

Before, it leaked space due to caching lists of keys. Now all necessary
data about keys is calculated as they stream in.

The "nearly constant" is due to getKeysPresent, which builds up a lot
of [] thunks as it traverses .git/annex/objects/. Will deal with it later.
This commit is contained in:
Joey Hess 2012-03-11 17:15:58 -04:00
parent b086e32c63
commit ff3644ad38
4 changed files with 67 additions and 49 deletions

View file

@ -300,7 +300,7 @@ getKeysPresent' dir = do
-- 2 levels of hashing -- 2 levels of hashing
levela <- dirContents dir levela <- dirContents dir
levelb <- mapM dirContents levela levelb <- mapM dirContents levela
contents <- mapM dirContents (concat levelb) contents <- unsafeInterleaveIO $ mapM dirContents (concat levelb)
let files = concat contents let files = concat contents
return $ mapMaybe (fileKey . takeFileName) files return $ mapMaybe (fileKey . takeFileName) files

View file

@ -5,12 +5,12 @@
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
{-# LANGUAGE BangPatterns #-}
module Command.Status where module Command.Status where
import Control.Monad.State.Strict import Control.Monad.State.Strict
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.Set as S
import Data.Set (Set)
import Text.JSON import Text.JSON
import Common.Annex import Common.Annex
@ -32,10 +32,18 @@ import Remote
-- a named computation that produces a statistic -- a named computation that produces a statistic
type Stat = StatState (Maybe (String, StatState String)) type Stat = StatState (Maybe (String, StatState String))
-- cached info that multiple Stats may need -- data about a set of keys
data KeyData = KeyData
{ countKeys :: Integer
, sizeKeys :: Integer
, unknownSizeKeys :: Integer
, backendsKeys :: M.Map String Integer
}
-- cached info that multiple Stats use
data StatInfo = StatInfo data StatInfo = StatInfo
{ keysPresentCache :: Maybe (Set Key) { presentData :: Maybe KeyData
, keysReferencedCache :: Maybe (Set Key) , referencedData :: Maybe KeyData
} }
-- a state monad for running Stats in -- a state monad for running Stats in
@ -122,19 +130,19 @@ remote_list level desc = stat n $ nojson $ lift $ do
local_annex_size :: Stat local_annex_size :: Stat
local_annex_size = stat "local annex size" $ json id $ local_annex_size = stat "local annex size" $ json id $
keySizeSum <$> cachedKeysPresent showSizeKeys <$> cachedPresentData
local_annex_keys :: Stat local_annex_keys :: Stat
local_annex_keys = stat "local annex keys" $ json show $ local_annex_keys = stat "local annex keys" $ json show $
S.size <$> cachedKeysPresent countKeys <$> cachedPresentData
known_annex_size :: Stat known_annex_size :: Stat
known_annex_size = stat "known annex size" $ json id $ known_annex_size = stat "known annex size" $ json id $
keySizeSum <$> cachedKeysReferenced showSizeKeys <$> cachedReferencedData
known_annex_keys :: Stat known_annex_keys :: Stat
known_annex_keys = stat "known annex keys" $ json show $ known_annex_keys = stat "known annex keys" $ json show $
S.size <$> cachedKeysReferenced countKeys <$> cachedReferencedData
tmp_size :: Stat tmp_size :: Stat
tmp_size = staleSize "temporary directory size" gitAnnexTmpDir tmp_size = staleSize "temporary directory size" gitAnnexTmpDir
@ -144,46 +152,62 @@ bad_data_size = staleSize "bad keys size" gitAnnexBadDir
backend_usage :: Stat backend_usage :: Stat
backend_usage = stat "backend usage" $ nojson $ backend_usage = stat "backend usage" $ nojson $
calc <$> cachedKeysReferenced <*> cachedKeysPresent calc
<$> (backendsKeys <$> cachedReferencedData)
<*> (backendsKeys <$> cachedPresentData)
where where
calc a b = pp "" $ reverse . sort $ map swap $ splits $ S.toList $ S.union a b calc a b = pp "" $ reverse . sort $ map swap $ M.toList $ M.unionWith (+) a b
splits :: [Key] -> [(String, Integer)]
splits ks = M.toList $ M.fromListWith (+) $ map tcount ks
tcount k = (keyBackendName k, 1)
swap (a, b) = (b, a)
pp c [] = c pp c [] = c
pp c ((n, b):xs) = "\n\t" ++ b ++ ": " ++ show n ++ pp c xs pp c ((n, b):xs) = "\n\t" ++ b ++ ": " ++ show n ++ pp c xs
swap (a, b) = (b, a)
cachedKeysPresent :: StatState (Set Key) cachedPresentData :: StatState KeyData
cachedKeysPresent = do cachedPresentData = do
s <- get s <- get
case keysPresentCache s of case presentData s of
Just v -> return v Just v -> return v
Nothing -> do Nothing -> do
keys <- S.fromList <$> lift getKeysPresent v <- foldKeys <$> lift getKeysPresent
put s { keysPresentCache = Just keys } put s { presentData = Just v }
return keys return v
cachedKeysReferenced :: StatState (Set Key) cachedReferencedData :: StatState KeyData
cachedKeysReferenced = do cachedReferencedData = do
s <- get s <- get
case keysReferencedCache s of case referencedData s of
Just v -> return v Just v -> return v
Nothing -> do Nothing -> do
keys <- S.fromList <$> lift Command.Unused.getKeysReferenced !v <- lift $ Command.Unused.withKeysReferenced
put s { keysReferencedCache = Just keys } emptyKeyData addKey
return keys put s { referencedData = Just v }
return v
keySizeSum :: Set Key -> String emptyKeyData :: KeyData
keySizeSum s = total ++ missingnote emptyKeyData = KeyData 0 0 0 M.empty
foldKeys :: [Key] -> KeyData
foldKeys = foldl' (flip addKey) emptyKeyData
addKey :: Key -> KeyData -> KeyData
addKey key (KeyData count size unknownsize backends) =
KeyData count' size' unknownsize' backends'
where where
knownsizes = mapMaybe keySize $ S.toList s {- All calculations strict to avoid thunks when repeatedly
total = roughSize storageUnits False $ sum knownsizes - applied to many keys. -}
missing = S.size s - genericLength knownsizes !count' = count + 1
!backends' = M.insertWith' (+) (keyBackendName key) 1 backends
!size' = maybe size (+ size) ks
!unknownsize' = maybe (unknownsize + 1) (const unknownsize) ks
ks = keySize key
showSizeKeys :: KeyData -> String
showSizeKeys d = total ++ missingnote
where
total = roughSize storageUnits False $ sizeKeys d
missingnote missingnote
| missing == 0 = "" | unknownSizeKeys d == 0 = ""
| otherwise = aside $ | otherwise = aside $
"+ " ++ show missing ++ "+ " ++ show (unknownSizeKeys d) ++
" keys of unknown size" " keys of unknown size"
staleSize :: String -> (Git.Repo -> FilePath) -> Stat staleSize :: String -> (Git.Repo -> FilePath) -> Stat
@ -192,7 +216,7 @@ staleSize label dirspec = do
if null keys if null keys
then nostat then nostat
else stat label $ json (++ aside "clean up with git-annex unused") $ else stat label $ json (++ aside "clean up with git-annex unused") $
return $ keySizeSum $ S.fromList keys return $ showSizeKeys $ foldKeys keys
aside :: String -> String aside :: String -> String
aside s = " (" ++ s ++ ")" aside s = " (" ++ s ++ ")"

View file

@ -1,6 +1,6 @@
{- git-annex command {- git-annex command
- -
- Copyright 2010-2011 Joey Hess <joey@kitenet.net> - Copyright 2010-2012 Joey Hess <joey@kitenet.net>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -171,7 +171,7 @@ excludeReferenced l = do
go s (r:rs) go s (r:rs)
| s == S.empty = return [] -- optimisation | s == S.empty = return [] -- optimisation
| otherwise = do | otherwise = do
!s' <- withKeysReferencedInGit r s S.delete s' <- withKeysReferencedInGit r s S.delete
go s' rs go s' rs
{- Finds items in the first, smaller list, that are not {- Finds items in the first, smaller list, that are not
@ -186,21 +186,14 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
where where
remove a b = foldl (flip S.delete) b a remove a b = foldl (flip S.delete) b a
{- List of keys referenced by symlinks in the git repo. -}
getKeysReferenced :: Annex [Key]
getKeysReferenced = do
top <- fromRepo Git.workTree
files <- inRepo $ LsFiles.inRepo [top]
keypairs <- mapM Backend.lookupFile files
return $ map fst $ catMaybes keypairs
{- Given an initial value, mutates it using an action for each {- Given an initial value, mutates it using an action for each
- key referenced by symlinks in the git repo. -} - key referenced by symlinks in the git repo. -}
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
withKeysReferenced initial a = do withKeysReferenced initial a = go initial =<< files
top <- fromRepo Git.workTree
go initial =<< inRepo (LsFiles.inRepo [top])
where where
files = do
top <- fromRepo Git.workTree
inRepo $ LsFiles.inRepo [top]
go v [] = return v go v [] = return v
go v (f:fs) = do go v (f:fs) = do
x <- Backend.lookupFile f x <- Backend.lookupFile f

1
debian/changelog vendored
View file

@ -6,6 +6,7 @@ git-annex (3.20120310) UNRELEASED; urgency=low
* unused: Reduce memory usage significantly. Still not constant * unused: Reduce memory usage significantly. Still not constant
space, but now only needs to store the set of file contents that space, but now only needs to store the set of file contents that
are present in the annex in memory. are present in the annex in memory.
* status: Fixed to run in nearly constant space.
-- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400 -- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400