status: Massively sped up; remove --fast mode.

Using Sets is the right thing; they have constant size lookup like my
SizeList, and logn insertation, which beats nub to death.

Runs faster than --fast mode did before, and gives accurate counts.

13 seconds total runtime with a warm cache in a repository with 40 thousand
keys.
This commit is contained in:
Joey Hess 2011-09-20 18:57:05 -04:00
parent cabbefd9d2
commit 9f5c7a246b
3 changed files with 39 additions and 65 deletions

View file

@ -13,8 +13,9 @@ import Data.Maybe
import System.IO import System.IO
import Data.List import Data.List
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.Set as S
import Data.Set (Set)
import qualified Annex
import qualified Types.Backend as B import qualified Types.Backend as B
import qualified Types.Remote as R import qualified Types.Remote as R
import qualified Remote import qualified Remote
@ -23,33 +24,23 @@ import qualified Git
import Command import Command
import Types import Types
import Utility.DataUnits import Utility.DataUnits
import Utility.Conditional
import Content import Content
import Types.Key import Types.Key
import Locations import Locations
import Backend import Backend
import Messages
-- a named computation that produces a statistic -- a named computation that produces a statistic
type Stat = StatState (Maybe (String, Bool, StatState String)) type Stat = StatState (Maybe (String, StatState String))
-- cached info that multiple Stats may need -- cached info that multiple Stats may need
data StatInfo = StatInfo data StatInfo = StatInfo
{ keysPresentCache :: Maybe (SizeList Key) { keysPresentCache :: Maybe (Set Key)
, keysReferencedCache :: Maybe (SizeList Key) , keysReferencedCache :: Maybe (Set Key)
} }
-- a state monad for running Stats in -- a state monad for running Stats in
type StatState = StateT StatInfo Annex type StatState = StateT StatInfo Annex
-- a list with a known length
-- (Integer is used for the length to avoid
-- blowing up if someone annexed billions of files..)
type SizeList a = ([a], Integer)
sizeList :: [a] -> SizeList a
sizeList l = (l, genericLength l)
command :: [Command] command :: [Command]
command = [repoCommand "status" paramNothing seek command = [repoCommand "status" paramNothing seek
"shows status information about the annex"] "shows status information about the annex"]
@ -76,15 +67,10 @@ stats =
start :: CommandStart start :: CommandStart
start = do start = do
evalStateT (mapM_ showStat stats) (StatInfo Nothing Nothing) evalStateT (mapM_ showStat stats) (StatInfo Nothing Nothing)
fastmode_note
stop stop
fastmode_note :: Annex () stat :: String -> StatState String -> Stat
fastmode_note = whenM (Annex.getState Annex.fast) $ stat desc a = return $ Just (desc, a)
showLongNote "(*) approximate due to fast mode"
stat :: String -> Bool -> StatState String -> Stat
stat desc approx a = return $ Just (desc, approx, a)
nostat :: Stat nostat :: Stat
nostat = return Nothing nostat = return Nothing
@ -92,37 +78,35 @@ nostat = return Nothing
showStat :: Stat -> StatState () showStat :: Stat -> StatState ()
showStat s = calc =<< s showStat s = calc =<< s
where where
calc (Just (desc, approx, a)) = do calc (Just (desc, a)) = do
fast <- lift $ Annex.getState Annex.fast liftIO $ putStr $ desc ++ ": "
let star = if fast && approx then "(*)" else ""
liftIO $ putStr $ desc ++ star ++ ": "
liftIO $ hFlush stdout liftIO $ hFlush stdout
liftIO . putStrLn =<< a liftIO . putStrLn =<< a
calc Nothing = return () calc Nothing = return ()
supported_backends :: Stat supported_backends :: Stat
supported_backends = stat "supported backends" False $ supported_backends = stat "supported backends" $
return $ unwords $ map B.name Backend.list return $ unwords $ map B.name Backend.list
supported_remote_types :: Stat supported_remote_types :: Stat
supported_remote_types = stat "supported remote types" False $ supported_remote_types = stat "supported remote types" $
return $ unwords $ map R.typename Remote.remoteTypes return $ unwords $ map R.typename Remote.remoteTypes
local_annex_size :: Stat local_annex_size :: Stat
local_annex_size = stat "local annex size" False $ local_annex_size = stat "local annex size" $
cachedKeysPresent >>= keySizeSum cachedKeysPresent >>= keySizeSum
total_annex_size :: Stat total_annex_size :: Stat
total_annex_size = stat "total annex size" True $ total_annex_size = stat "total annex size" $
cachedKeysReferenced >>= keySizeSum cachedKeysReferenced >>= keySizeSum
local_annex_keys :: Stat local_annex_keys :: Stat
local_annex_keys = stat "local annex keys" False $ local_annex_keys = stat "local annex keys" $
show . snd <$> cachedKeysPresent show . S.size <$> cachedKeysPresent
total_annex_keys :: Stat total_annex_keys :: Stat
total_annex_keys = stat "total annex keys" True $ total_annex_keys = stat "total annex keys" $
show . snd <$> cachedKeysReferenced show . S.size <$> cachedKeysReferenced
tmp_size :: Stat tmp_size :: Stat
tmp_size = staleSize "temporary directory size" gitAnnexTmpDir tmp_size = staleSize "temporary directory size" gitAnnexTmpDir
@ -131,9 +115,9 @@ bad_data_size :: Stat
bad_data_size = staleSize "bad keys size" gitAnnexBadDir bad_data_size = staleSize "bad keys size" gitAnnexBadDir
backend_usage :: Stat backend_usage :: Stat
backend_usage = stat "backend usage" True $ usage <$> cachedKeysReferenced backend_usage = stat "backend usage" $ usage <$> cachedKeysReferenced
where where
usage (ks, _) = pp "" $ sort $ map swap $ splits ks usage ks = pp "" $ sort $ map swap $ splits $ S.toList ks
splits :: [Key] -> [(String, Integer)] splits :: [Key] -> [(String, Integer)]
splits ks = M.toList $ M.fromListWith (+) $ map tcount ks splits ks = M.toList $ M.fromListWith (+) $ map tcount ks
tcount k = (keyBackendName k, 1) tcount k = (keyBackendName k, 1)
@ -141,48 +125,44 @@ backend_usage = stat "backend usage" True $ usage <$> cachedKeysReferenced
pp c [] = c pp c [] = c
pp c ((n, b):xs) = "\n\t" ++ b ++ ": " ++ show n ++ pp c xs pp c ((n, b):xs) = "\n\t" ++ b ++ ": " ++ show n ++ pp c xs
cachedKeysPresent :: StatState (SizeList Key) cachedKeysPresent :: StatState (Set Key)
cachedKeysPresent = do cachedKeysPresent = do
s <- get s <- get
case keysPresentCache s of case keysPresentCache s of
Just v -> return v Just v -> return v
Nothing -> do Nothing -> do
keys <- lift getKeysPresent keys <- S.fromList <$> lift getKeysPresent
let v = sizeList keys put s { keysPresentCache = Just keys }
put s { keysPresentCache = Just v } return keys
return v
cachedKeysReferenced :: StatState (SizeList Key) cachedKeysReferenced :: StatState (Set Key)
cachedKeysReferenced = do cachedKeysReferenced = do
s <- get s <- get
case keysReferencedCache s of case keysReferencedCache s of
Just v -> return v Just v -> return v
Nothing -> do Nothing -> do
-- A given key may be referenced repeatedly, keys <- S.fromList <$> lift Command.Unused.getKeysReferenced
-- so nub is needed for accuracy, but is slow. put s { keysReferencedCache = Just keys }
keys <- lift Command.Unused.getKeysReferenced return keys
fast <- lift $ Annex.getState Annex.fast
let v = sizeList $ if fast then keys else nub keys
put s { keysReferencedCache = Just v }
return v
keySizeSum :: SizeList Key -> StatState String keySizeSum :: Set Key -> StatState String
keySizeSum (keys, len) = do keySizeSum s = do
let knownsizes = mapMaybe keySize keys let (sizes, unknownsizes) = S.partition isJust $ S.map keySize s
let total = roughSize storageUnits False $ sum knownsizes let total = roughSize storageUnits False $
let missing = len - genericLength knownsizes fromJust $ S.fold (liftM2 (+)) (Just 0) sizes
let num = S.size unknownsizes
return $ total ++ return $ total ++
if missing > 0 if num == 0
then aside $ "but " ++ show missing ++ " keys have unknown size" then ""
else "" else aside $ "but " ++ show num ++ " keys have unknown size"
staleSize :: String -> (Git.Repo -> FilePath) -> Stat staleSize :: String -> (Git.Repo -> FilePath) -> Stat
staleSize label dirspec = do staleSize label dirspec = do
keys <- lift (Command.Unused.staleKeys dirspec) keys <- lift (Command.Unused.staleKeys dirspec)
if null keys if null keys
then nostat then nostat
else stat label False $ do else stat label $ do
s <- keySizeSum $ sizeList keys s <- keySizeSum $ S.fromList keys
return $ s ++ aside "clean up with git-annex unused" return $ s ++ aside "clean up with git-annex unused"
aside :: String -> String aside :: String -> String

3
debian/changelog vendored
View file

@ -12,8 +12,7 @@ git-annex (3.20110916) UNRELEASED; urgency=low
match the specified conditions. match the specified conditions.
* Note that this is a behavior change for git-annex find! Old behavior * Note that this is a behavior change for git-annex find! Old behavior
can be gotten by using: git-annex find --in . can be gotten by using: git-annex find --in .
* status: In --fast mode, all status info is displayed now; but some * status: Massively sped up; remove --fast mode.
of it is only approximate, and is marked as such.
-- Joey Hess <joeyh@debian.org> Sun, 18 Sep 2011 18:25:51 -0400 -- Joey Hess <joeyh@debian.org> Sun, 18 Sep 2011 18:25:51 -0400

View file

@ -238,11 +238,6 @@ subdirectories).
Displays some statistics and other information, including how much data Displays some statistics and other information, including how much data
is in the annex. is in the annex.
Some of the statistics can take a while to generate, and those
come last. You can ctrl-c this command once it's displayed the
information you wanted to see. Or, use --fast to produce statistics
more quickly, but possibly less accurately.
* map * map
Helps you keep track of your repositories, and the connections between them, Helps you keep track of your repositories, and the connections between them,