info: Added calculation of combined annex size of all repositories

Factored out overLocationLogs from CmdLine.Seek, which can calculate this
pretty fast even in a large repo. In my big repo, the time to run git-annex
info went up from 1.33s to 8.5s.

Note that the "backend usage" stats are for annexed files in the working
tree only, not all annexed files. This new data source would let that be
changed, but that would be a confusing behavior change. And I cannot
retitle it either, out of fear something uses the current title (eg parsing
the json).

Also note that, while time says "402108maxresident" in my big repo now,
up from "54092maxresident", top shows the RES constant at 64mb, and it
was 48mb before. So I don't think there is a memory leak. I tried using
deepseq to force full evaluation of addKeyCopies and memory use didn't
change, which also says no memory leak. And indeed, not even calling
addKeyCopies resulted in the same memory use. Probably the increased memory
usage is buffering the stream of data from git in overLocationLogs.

Sponsored-by: Brett Eisenberg on Patreon
This commit is contained in:
Joey Hess 2023-11-08 13:15:00 -04:00
parent 8768966d97
commit 11cc9f1933
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 90 additions and 24 deletions

View file

@ -12,6 +12,7 @@ git-annex (10.20230927) UNRELEASED; urgency=medium
* Windows: When git-annex init is installing hook scripts, it will
avoid ending lines with CR for portability. Existing hook scripts
that do have CR line endings will not be changed.
* info: Added calculation of combined annex size of all repositories.
-- Joey Hess <id@joeyh.name> Tue, 10 Oct 2023 13:17:31 -0400

View file

@ -276,24 +276,14 @@ withKeyOptions' ko auto mkkeyaction fallbackaction worktreeitems = do
-- those. This significantly speeds up typical operations
-- that need to look at the location log for each key.
runallkeys = do
checktimelimit <- mkCheckTimeLimit
keyaction <- mkkeyaction
config <- Annex.getGitConfig
let getk = locationLogFileKey config
checktimelimit <- mkCheckTimeLimit
let discard reader = reader >>= \case
Nothing -> noop
Just _ -> discard reader
let go reader = reader >>= \case
Just (k, f, content) -> checktimelimit (discard reader) $ do
maybe noop (Annex.Branch.precache f) content
unlessM (checkDead k) $
keyaction Nothing (SeekInput [], k, mkActionItem k)
go reader
Nothing -> return ()
Annex.Branch.overBranchFileContents getk go >>= \case
Just r -> return r
Nothing -> giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents operating on all keys. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)"
overLocationLogs' ()
(\reader cont -> checktimelimit (discard reader) cont)
(\k _ () -> keyaction Nothing (SeekInput [], k, mkActionItem k))
runkeyaction getks = do
keyaction <- mkkeyaction

View file

@ -89,12 +89,13 @@ data StatInfo = StatInfo
{ presentData :: Maybe KeyInfo
, referencedData :: Maybe KeyInfo
, repoData :: M.Map UUID KeyInfo
, allRepoData :: Maybe KeyInfo
, numCopiesStats :: Maybe NumCopiesStats
, infoOptions :: InfoOptions
}
emptyStatInfo :: InfoOptions -> StatInfo
emptyStatInfo = StatInfo Nothing Nothing M.empty Nothing
emptyStatInfo = StatInfo Nothing Nothing M.empty Nothing Nothing
-- a state monad for running Stats in
type StatState = StateT StatInfo Annex
@ -281,8 +282,9 @@ global_slow_stats =
, local_annex_size
, known_annex_files True
, known_annex_size True
, bloom_info
, total_annex_size
, backend_usage
, bloom_info
]
tree_fast_stats :: Bool -> [FilePath -> Stat]
@ -436,6 +438,11 @@ known_annex_size isworktree =
simpleStat ("size of annexed files in " ++ treeDesc isworktree) $
showSizeKeys =<< cachedReferencedData
total_annex_size :: Stat
total_annex_size =
simpleStat "combined annex size of all repositories" $
showSizeKeys =<< cachedAllRepoData
treeDesc :: Bool -> String
treeDesc True = "working tree"
treeDesc False = "tree"
@ -612,6 +619,23 @@ cachedReferencedData = do
put s { referencedData = Just v }
return v
cachedAllRepoData :: StatState KeyInfo
cachedAllRepoData = do
s <- get
case allRepoData s of
Just v -> return v
Nothing -> do
matcher <- lift getKeyOnlyMatcher
!v <- lift $ overLocationLogs emptyKeyInfo $ \k locs d -> do
numcopies <- genericLength . snd
<$> trustPartition DeadTrusted locs
ifM (matchOnKey matcher k)
( return (addKeyCopies numcopies k d)
, return d
)
put s { allRepoData = Just v }
return v
-- currently only available for directory info
cachedNumCopiesStats :: StatState (Maybe NumCopiesStats)
cachedNumCopiesStats = numCopiesStats <$> get
@ -627,7 +651,13 @@ getDirStatInfo o dir = do
(presentdata, referenceddata, numcopiesstats, repodata) <-
Command.Unused.withKeysFilesReferencedIn dir initial
(update matcher fast)
return $ StatInfo (Just presentdata) (Just referenceddata) repodata (Just numcopiesstats) o
return $ StatInfo
(Just presentdata)
(Just referenceddata)
repodata
Nothing
(Just numcopiesstats)
o
where
initial = (emptyKeyInfo, emptyKeyInfo, emptyNumCopiesStats, M.empty)
update matcher fast key file vs@(presentdata, referenceddata, numcopiesstats, repodata) =
@ -663,7 +693,7 @@ getTreeStatInfo o r = do
(presentdata, referenceddata, repodata) <- go fast matcher ls initial
ifM (liftIO cleanup)
( return $ Just $
StatInfo (Just presentdata) (Just referenceddata) repodata Nothing o
StatInfo (Just presentdata) (Just referenceddata) repodata Nothing Nothing o
, return Nothing
)
where
@ -695,16 +725,19 @@ emptyNumCopiesStats :: NumCopiesStats
emptyNumCopiesStats = NumCopiesStats M.empty
addKey :: Key -> KeyInfo -> KeyInfo
addKey key (KeyInfo count size unknownsize backends) =
addKey = addKeyCopies 1
addKeyCopies :: Integer -> Key -> KeyInfo -> KeyInfo
addKeyCopies numcopies key (KeyInfo count size unknownsize backends) =
KeyInfo count' size' unknownsize' backends'
where
{- All calculations strict to avoid thunks when repeatedly
- applied to many keys. -}
!count' = count + 1
!backends' = M.insertWith (+) (fromKey keyVariety key) 1 backends
!size' = maybe size (+ size) ks
!size' = maybe size (\sz -> sz * numcopies + size) ks
!unknownsize' = maybe (unknownsize + 1) (const unknownsize) ks
ks = fromKey keySize key
!ks = fromKey keySize key
updateRepoData :: Key -> [UUID] -> M.Map UUID KeyInfo -> M.Map UUID KeyInfo
updateRepoData key locs m = m'
@ -776,3 +809,5 @@ matchOnKey matcher k = matcher $ MatchingInfo $ ProvidedInfo
, providedMimeEncoding = Nothing
, providedLinkType = Nothing
}

View file

@ -8,11 +8,13 @@
- Repositories record their UUID and the date when they --get or --drop
- a value.
-
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
- Copyright 2010-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE BangPatterns #-}
module Logs.Location (
LogStatus(..),
logStatus,
@ -29,6 +31,8 @@ module Logs.Location (
loggedKeys,
loggedKeysFor,
loggedKeysFor',
overLocationLogs,
overLocationLogs',
) where
import Annex.Common
@ -42,6 +46,7 @@ import Git.Types (RefDate, Ref)
import qualified Annex
import Data.Time.Clock
import qualified Data.ByteString.Lazy as L
{- Log a change in the presence of a key's value in current repository. -}
logStatus :: Key -> LogStatus -> Annex ()
@ -83,6 +88,11 @@ loggedLocationsHistorical = getLoggedLocations . historicalLogInfo
loggedLocationsRef :: Ref -> Annex [UUID]
loggedLocationsRef ref = map (toUUID . fromLogInfo) . getLog <$> catObject ref
{- Parses the content of a log file and gets the locations in it. -}
parseLoggedLocations :: L.ByteString -> [UUID]
parseLoggedLocations l = map (toUUID . fromLogInfo . info)
(filterPresent (parseLog l))
getLoggedLocations :: (RawFilePath -> Annex [LogInfo]) -> Key -> Annex [UUID]
getLoggedLocations getter key = do
config <- Annex.getGitConfig
@ -174,3 +184,33 @@ loggedKeysFor' u = loggedKeys' isthere
us <- loggedLocations k
let !there = u `elem` us
return there
{- This is much faster than loggedKeys. -}
overLocationLogs :: v -> (Key -> [UUID] -> v -> Annex v) -> Annex v
overLocationLogs v = overLocationLogs' v (flip const)
overLocationLogs'
:: v
-> (Annex (Maybe (Key, RawFilePath, Maybe L.ByteString)) -> Annex v -> Annex v)
-> (Key -> [UUID] -> v -> Annex v)
-> Annex v
overLocationLogs' iv discarder keyaction = do
config <- Annex.getGitConfig
let getk = locationLogFileKey config
let go v reader = reader >>= \case
Just (k, f, content) -> discarder reader $ do
-- precache to make checkDead fast, and also to
-- make any accesses done in keyaction fast.
maybe noop (Annex.Branch.precache f) content
ifM (checkDead k)
( go v reader
, do
!v' <- keyaction k (maybe [] parseLoggedLocations content) v
go v' reader
)
Nothing -> return v
Annex.Branch.overBranchFileContents getk (go iv) >>= \case
Just r -> return r
Nothing -> giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents operating on all keys. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)"

View file

@ -12,8 +12,8 @@ Displays statistics and other information for the specified item.
When no item is specified, displays overall information. This includes a
list of all known repositories, how much annexed data is present in the
local repository, and the total size of all annexed data in the working
tree.
local repository, the total size of all annexed data in the working
tree, and the combined size of all annexed data in all known repositories.
When a directory is specified, displays information
about the annexed files in that directory (and subdirectories).