update RepoSize database from git-annex branch incrementally

The use of catObjectStream is optimally fast. Although it might be
possible to combine this with git-annex branch merge to avoid some
redundant work.

Benchmarking, a git-annex branch that had 100000 files changed
took less than 1.88 seconds to run through this.
This commit is contained in:
Joey Hess 2024-08-17 13:30:24 -04:00
parent 8239824d92
commit d09a005f2b
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
9 changed files with 115 additions and 33 deletions

View file

@ -17,12 +17,18 @@ import qualified Annex
import Annex.Branch (UnmergedBranches(..), getBranch)
import Types.RepoSize
import qualified Database.RepoSize as Db
import Logs
import Logs.Location
import Logs.UUID
import Git.Types (Sha)
import Git.FilePath
import Git.CatFile
import qualified Git.DiffTree as DiffTree
import Control.Concurrent
import Control.Concurrent.Async
import qualified Data.Map.Strict as M
import qualified Data.Set as S
{- Gets the repo size map. Cached for speed. -}
getRepoSizes :: Annex (M.Map UUID RepoSize)
@ -49,10 +55,7 @@ calcRepoSizes rsv = bracket setup cleanup $ \h -> go h `onException` failed
currbranchsha <- getBranch
if oldbranchsha == currbranchsha
then calcJournalledRepoSizes oldsizemap oldbranchsha
else do
-- XXX todo incremental update by diffing
-- from old to new branch.
calculatefromscratch h
else incrementalupdate h oldsizemap oldbranchsha currbranchsha
liftIO $ putMVar rsv (Just sizemap)
return sizemap
@ -62,6 +65,11 @@ calcRepoSizes rsv = bracket setup cleanup $ \h -> go h `onException` failed
liftIO $ Db.setRepoSizes h sizemap branchsha
calcJournalledRepoSizes sizemap branchsha
incrementalupdate h oldsizemap oldbranchsha currbranchsha = do
(sizemap, branchsha) <- diffBranchRepoSizes oldsizemap oldbranchsha currbranchsha
liftIO $ Db.setRepoSizes h sizemap branchsha
calcJournalledRepoSizes sizemap branchsha
setup = Db.openDb
cleanup = Db.closeDb
@ -75,7 +83,7 @@ calcRepoSizes rsv = bracket setup cleanup $ \h -> go h `onException` failed
- branch commit that was used.
-
- The map includes the UUIDs of all known repositories, including
- repositories that are empty.
- repositories that are empty. But clusters are not included.
-
- Note that private repositories, which do not get recorded in
- the git-annex branch, will have 0 size. journalledRepoSizes
@ -100,8 +108,48 @@ calcJournalledRepoSizes
-> Sha
-> Annex (M.Map UUID RepoSize)
calcJournalledRepoSizes startmap branchsha =
overLocationLogsJournal startmap branchsha accumsizes Nothing
overLocationLogsJournal startmap branchsha
(\k v m -> pure (accumRepoSizes k v m))
Nothing
{- Incremental update by diffing. -}
diffBranchRepoSizes :: M.Map UUID RepoSize -> Sha -> Sha -> Annex (M.Map UUID RepoSize, Sha)
diffBranchRepoSizes oldsizemap oldbranchsha newbranchsha = do
g <- Annex.gitRepo
catObjectStream g $ \feeder closer reader -> do
(l, cleanup) <- inRepo $
DiffTree.diffTreeRecursive oldbranchsha newbranchsha
feedtid <- liftIO $ async $ do
forM_ l $ feedpairs feeder
closer
newsizemap <- readpairs 500000 reader oldsizemap Nothing
liftIO $ wait feedtid
ifM (liftIO cleanup)
( return (newsizemap, newbranchsha)
, return (oldsizemap, oldbranchsha)
)
where
accumsizes k (newlocs, removedlocs) m = return $
let m' = foldl' (flip $ M.alter $ addKeyRepoSize k) m newlocs
in foldl' (flip $ M.alter $ removeKeyRepoSize k) m' removedlocs
feedpairs feeder ti =
let f = getTopFilePath (DiffTree.file ti)
in case extLogFileKey locationLogExt f of
Nothing -> noop
Just k -> do
feeder (k, DiffTree.srcsha ti)
feeder (k, DiffTree.dstsha ti)
readpairs n reader sizemap Nothing = liftIO reader >>= \case
Just (_k, oldcontent) -> readpairs n reader sizemap (Just oldcontent)
Nothing -> return sizemap
readpairs n reader sizemap (Just oldcontent) = liftIO reader >>= \case
Just (k, newcontent) ->
let prevlog = parselog oldcontent
currlog = parselog newcontent
newlocs = S.difference currlog prevlog
removedlocs = S.difference prevlog currlog
!sizemap' = accumRepoSizes k (newlocs, removedlocs) sizemap
in do
n' <- countdownToMessage n $
showSideAction "calculating repository sizes"
readpairs n' reader sizemap' Nothing
Nothing -> return sizemap
parselog = maybe mempty (S.fromList . parseLoggedLocationsWithoutClusters)