status: Massively sped up; remove --fast mode.

Using Sets is the right thing; they have constant size lookup like my
SizeList, and logn insertation, which beats nub to death.

Runs faster than --fast mode did before, and gives accurate counts.

13 seconds total runtime with a warm cache in a repository with 40 thousand
keys.
This commit is contained in:
Joey Hess 2011-09-20 18:57:05 -04:00
parent cabbefd9d2
commit 9f5c7a246b
3 changed files with 39 additions and 65 deletions

View file

@ -13,8 +13,9 @@ import Data.Maybe
import System.IO
import Data.List
import qualified Data.Map as M
import qualified Data.Set as S
import Data.Set (Set)
import qualified Annex
import qualified Types.Backend as B
import qualified Types.Remote as R
import qualified Remote
@ -23,33 +24,23 @@ import qualified Git
import Command
import Types
import Utility.DataUnits
import Utility.Conditional
import Content
import Types.Key
import Locations
import Backend
import Messages
-- a named computation that produces a statistic
type Stat = StatState (Maybe (String, Bool, StatState String))
type Stat = StatState (Maybe (String, StatState String))
-- cached info that multiple Stats may need
data StatInfo = StatInfo
{ keysPresentCache :: Maybe (SizeList Key)
, keysReferencedCache :: Maybe (SizeList Key)
{ keysPresentCache :: Maybe (Set Key)
, keysReferencedCache :: Maybe (Set Key)
}
-- a state monad for running Stats in
type StatState = StateT StatInfo Annex
-- a list with a known length
-- (Integer is used for the length to avoid
-- blowing up if someone annexed billions of files..)
type SizeList a = ([a], Integer)
sizeList :: [a] -> SizeList a
sizeList l = (l, genericLength l)
command :: [Command]
command = [repoCommand "status" paramNothing seek
"shows status information about the annex"]
@ -76,15 +67,10 @@ stats =
start :: CommandStart
start = do
evalStateT (mapM_ showStat stats) (StatInfo Nothing Nothing)
fastmode_note
stop
fastmode_note :: Annex ()
fastmode_note = whenM (Annex.getState Annex.fast) $
showLongNote "(*) approximate due to fast mode"
stat :: String -> Bool -> StatState String -> Stat
stat desc approx a = return $ Just (desc, approx, a)
stat :: String -> StatState String -> Stat
stat desc a = return $ Just (desc, a)
nostat :: Stat
nostat = return Nothing
@ -92,37 +78,35 @@ nostat = return Nothing
showStat :: Stat -> StatState ()
showStat s = calc =<< s
where
calc (Just (desc, approx, a)) = do
fast <- lift $ Annex.getState Annex.fast
let star = if fast && approx then "(*)" else ""
liftIO $ putStr $ desc ++ star ++ ": "
calc (Just (desc, a)) = do
liftIO $ putStr $ desc ++ ": "
liftIO $ hFlush stdout
liftIO . putStrLn =<< a
calc Nothing = return ()
supported_backends :: Stat
supported_backends = stat "supported backends" False $
supported_backends = stat "supported backends" $
return $ unwords $ map B.name Backend.list
supported_remote_types :: Stat
supported_remote_types = stat "supported remote types" False $
supported_remote_types = stat "supported remote types" $
return $ unwords $ map R.typename Remote.remoteTypes
local_annex_size :: Stat
local_annex_size = stat "local annex size" False $
local_annex_size = stat "local annex size" $
cachedKeysPresent >>= keySizeSum
total_annex_size :: Stat
total_annex_size = stat "total annex size" True $
total_annex_size = stat "total annex size" $
cachedKeysReferenced >>= keySizeSum
local_annex_keys :: Stat
local_annex_keys = stat "local annex keys" False $
show . snd <$> cachedKeysPresent
local_annex_keys = stat "local annex keys" $
show . S.size <$> cachedKeysPresent
total_annex_keys :: Stat
total_annex_keys = stat "total annex keys" True $
show . snd <$> cachedKeysReferenced
total_annex_keys = stat "total annex keys" $
show . S.size <$> cachedKeysReferenced
tmp_size :: Stat
tmp_size = staleSize "temporary directory size" gitAnnexTmpDir
@ -131,9 +115,9 @@ bad_data_size :: Stat
bad_data_size = staleSize "bad keys size" gitAnnexBadDir
backend_usage :: Stat
backend_usage = stat "backend usage" True $ usage <$> cachedKeysReferenced
backend_usage = stat "backend usage" $ usage <$> cachedKeysReferenced
where
usage (ks, _) = pp "" $ sort $ map swap $ splits ks
usage ks = pp "" $ sort $ map swap $ splits $ S.toList ks
splits :: [Key] -> [(String, Integer)]
splits ks = M.toList $ M.fromListWith (+) $ map tcount ks
tcount k = (keyBackendName k, 1)
@ -141,48 +125,44 @@ backend_usage = stat "backend usage" True $ usage <$> cachedKeysReferenced
pp c [] = c
pp c ((n, b):xs) = "\n\t" ++ b ++ ": " ++ show n ++ pp c xs
cachedKeysPresent :: StatState (SizeList Key)
cachedKeysPresent :: StatState (Set Key)
cachedKeysPresent = do
s <- get
case keysPresentCache s of
Just v -> return v
Nothing -> do
keys <- lift getKeysPresent
let v = sizeList keys
put s { keysPresentCache = Just v }
return v
keys <- S.fromList <$> lift getKeysPresent
put s { keysPresentCache = Just keys }
return keys
cachedKeysReferenced :: StatState (SizeList Key)
cachedKeysReferenced :: StatState (Set Key)
cachedKeysReferenced = do
s <- get
case keysReferencedCache s of
Just v -> return v
Nothing -> do
-- A given key may be referenced repeatedly,
-- so nub is needed for accuracy, but is slow.
keys <- lift Command.Unused.getKeysReferenced
fast <- lift $ Annex.getState Annex.fast
let v = sizeList $ if fast then keys else nub keys
put s { keysReferencedCache = Just v }
return v
keys <- S.fromList <$> lift Command.Unused.getKeysReferenced
put s { keysReferencedCache = Just keys }
return keys
keySizeSum :: SizeList Key -> StatState String
keySizeSum (keys, len) = do
let knownsizes = mapMaybe keySize keys
let total = roughSize storageUnits False $ sum knownsizes
let missing = len - genericLength knownsizes
keySizeSum :: Set Key -> StatState String
keySizeSum s = do
let (sizes, unknownsizes) = S.partition isJust $ S.map keySize s
let total = roughSize storageUnits False $
fromJust $ S.fold (liftM2 (+)) (Just 0) sizes
let num = S.size unknownsizes
return $ total ++
if missing > 0
then aside $ "but " ++ show missing ++ " keys have unknown size"
else ""
if num == 0
then ""
else aside $ "but " ++ show num ++ " keys have unknown size"
staleSize :: String -> (Git.Repo -> FilePath) -> Stat
staleSize label dirspec = do
keys <- lift (Command.Unused.staleKeys dirspec)
if null keys
then nostat
else stat label False $ do
s <- keySizeSum $ sizeList keys
else stat label $ do
s <- keySizeSum $ S.fromList keys
return $ s ++ aside "clean up with git-annex unused"
aside :: String -> String

3
debian/changelog vendored
View file

@ -12,8 +12,7 @@ git-annex (3.20110916) UNRELEASED; urgency=low
match the specified conditions.
* Note that this is a behavior change for git-annex find! Old behavior
can be gotten by using: git-annex find --in .
* status: In --fast mode, all status info is displayed now; but some
of it is only approximate, and is marked as such.
* status: Massively sped up; remove --fast mode.
-- Joey Hess <joeyh@debian.org> Sun, 18 Sep 2011 18:25:51 -0400

View file

@ -238,11 +238,6 @@ subdirectories).
Displays some statistics and other information, including how much data
is in the annex.
Some of the statistics can take a while to generate, and those
come last. You can ctrl-c this command once it's displayed the
information you wanted to see. Or, use --fast to produce statistics
more quickly, but possibly less accurately.
* map
Helps you keep track of your repositories, and the connections between them,