git-annex/Annex/RepoSize.hs
Joey Hess 4d2f95853d
closing in on finishing live reposizes
Fixed successfullyFinishedLiveSizeChange to not update the rolling total
when a redundant change is in RecentChanges.

Made setRepoSizes clear RecentChanges that are no longer needed.
It might be possible to clear those earlier, this is only a convenient
point to do it.

The reason it's safe to clear RecentChanges here is that, in order for a
live update to call successfullyFinishedLiveSizeChange, a change must be
made to a location log. If a RecentChange gets cleared, and just after
that a new live update is started, making the same change, the location
log has already been changed (since the RecentChange exists), and
so when the live update succeeds, it won't call
successfullyFinishedLiveSizeChange. The reason it doesn't
clear RecentChanges when there is a reduntant live update is because
I didn't want to think through whether or not all races are avoided in
that case.

The rolling total in SizeChanges is never cleared. Instead,
calcJournalledRepoSizes gets the initial value of it, and then
getLiveRepoSizes subtracts that initial value from the current value.
Since the rolling total can only be updated by updateRepoSize,
which is called with the journal locked, locking the journal in
calcJournalledRepoSizes ensures that the database does not change while
reading the journal.
2024-08-27 12:54:46 -04:00

214 lines
7.2 KiB
Haskell

{- git-annex repo sizes
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings, BangPatterns #-}
module Annex.RepoSize (
getRepoSizes,
getLiveRepoSizes,
) where
import Annex.Common
import qualified Annex
import Annex.Branch (UnmergedBranches(..), getBranch)
import qualified Database.RepoSize as Db
import Annex.Journal
import Logs
import Logs.Location
import Logs.UUID
import Git.Types (Sha)
import Git.FilePath
import Git.CatFile
import qualified Git.DiffTree as DiffTree
import Control.Concurrent
import Control.Concurrent.Async
import qualified Data.Map.Strict as M
import qualified Data.Set as S
{- Gets the repo size map. Cached for speed.
-
- Note that this is the size of all repositories as of the first time it
- was called. It does not update while git-annex is running.
-}
getRepoSizes :: Bool -> Annex (M.Map UUID RepoSize)
getRepoSizes quiet = M.map fst <$> getRepoSizes' quiet
getRepoSizes' :: Bool -> Annex (M.Map UUID (RepoSize, SizeOffset))
getRepoSizes' quiet = do
rsv <- Annex.getRead Annex.reposizes
liftIO (takeMVar rsv) >>= \case
Just sizemap -> do
liftIO $ putMVar rsv (Just sizemap)
return sizemap
Nothing -> calcRepoSizes quiet rsv
{- Like getRepoSizes, but with live updates. -}
getLiveRepoSizes :: Bool -> Annex (M.Map UUID RepoSize)
getLiveRepoSizes quiet = do
sizemap <- getRepoSizes' quiet
go sizemap `onException` return (M.map fst sizemap)
where
go sizemap = do
h <- Db.getRepoSizeHandle
liveoffsets <- liftIO $ Db.liveRepoOffsets h
let calc u (RepoSize size, SizeOffset startoffset) =
case M.lookup u liveoffsets of
Nothing -> RepoSize size
Just (SizeOffset offset) -> RepoSize $
size + (offset - startoffset)
return $ M.mapWithKey calc sizemap
{- Fills an empty Annex.reposizes MVar with current information
- from the git-annex branch, supplimented with journalled but
- not yet committed information.
-}
calcRepoSizes :: Bool -> MVar (Maybe (M.Map UUID (RepoSize, SizeOffset))) -> Annex (M.Map UUID (RepoSize, SizeOffset))
calcRepoSizes quiet rsv = go `onException` failed
where
go = do
h <- Db.getRepoSizeHandle
(oldsizemap, moldbranchsha) <- liftIO $ Db.getRepoSizes h
!sizemap <- case moldbranchsha of
Nothing -> calculatefromscratch h
Just oldbranchsha -> do
currbranchsha <- getBranch
if oldbranchsha == currbranchsha
then calcJournalledRepoSizes h oldsizemap oldbranchsha
else incrementalupdate h oldsizemap oldbranchsha currbranchsha
liftIO $ putMVar rsv (Just sizemap)
return sizemap
calculatefromscratch h = do
unless quiet $
showSideAction "calculating repository sizes"
(sizemap, branchsha) <- calcBranchRepoSizes
liftIO $ Db.setRepoSizes h sizemap branchsha
calcJournalledRepoSizes h sizemap branchsha
incrementalupdate h oldsizemap oldbranchsha currbranchsha = do
(sizemap, branchsha) <- diffBranchRepoSizes quiet oldsizemap oldbranchsha currbranchsha
liftIO $ Db.setRepoSizes h sizemap branchsha
calcJournalledRepoSizes h sizemap branchsha
failed = do
liftIO $ putMVar rsv (Just M.empty)
return M.empty
{- Sum up the sizes of all keys in all repositories, from the information
- in the git-annex branch, but not the journal. Retuns the sha of the
- branch commit that was used.
-
- The map includes the UUIDs of all known repositories, including
- repositories that are empty. But clusters are not included.
-
- Note that private repositories, which do not get recorded in
- the git-annex branch, will have 0 size. journalledRepoSizes
- takes care of getting repo sizes for those.
-}
calcBranchRepoSizes :: Annex (M.Map UUID RepoSize, Sha)
calcBranchRepoSizes = do
knownuuids <- M.keys <$> uuidDescMap
let startmap = M.fromList $ map (\u -> (u, RepoSize 0)) knownuuids
overLocationLogs True True startmap accumsizes >>= \case
UnmergedBranches v -> return v
NoUnmergedBranches v -> return v
where
accumsizes k locs m = return $
foldl' (flip $ M.alter $ addKeyRepoSize k) m locs
{- Given the RepoSizes calculated from the git-annex branch, updates it with
- data from journalled location logs.
-}
calcJournalledRepoSizes
:: Db.RepoSizeHandle
-> M.Map UUID RepoSize
-> Sha
-> Annex (M.Map UUID (RepoSize, SizeOffset))
calcJournalledRepoSizes h startmap branchsha =
-- Lock the journal to prevent updates to the size offsets
-- in the repository size database while this is processing
-- the journal files.
lockJournal $ \_jl -> do
sizemap <- overLocationLogsJournal startmap branchsha
(\k v m' -> pure (accumRepoSizes k v m'))
Nothing
offsets <- liftIO $ Db.recordedRepoOffsets h
let getoffset u = fromMaybe (SizeOffset 0) $ M.lookup u offsets
return $ M.mapWithKey (\u sz -> (sz, getoffset u)) sizemap
{- Incremental update by diffing. -}
diffBranchRepoSizes :: Bool -> M.Map UUID RepoSize -> Sha -> Sha -> Annex (M.Map UUID RepoSize, Sha)
diffBranchRepoSizes quiet oldsizemap oldbranchsha newbranchsha = do
g <- Annex.gitRepo
catObjectStream g $ \feeder closer reader -> do
(l, cleanup) <- inRepo $
DiffTree.diffTreeRecursive oldbranchsha newbranchsha
feedtid <- liftIO $ async $ do
forM_ l $ feedpairs feeder
closer
newsizemap <- readpairs 100000 reader oldsizemap Nothing
liftIO $ wait feedtid
ifM (liftIO cleanup)
( do
newsizemap' <- addemptyrepos newsizemap
return (newsizemap', newbranchsha)
, return (oldsizemap, oldbranchsha)
)
where
feedpairs feeder ti =
let f = getTopFilePath (DiffTree.file ti)
in case extLogFileKey locationLogExt f of
Nothing -> noop
Just k -> do
feeder (k, DiffTree.srcsha ti)
feeder (k, DiffTree.dstsha ti)
readpairs n reader sizemap Nothing = liftIO reader >>= \case
Just (_k, oldcontent) -> readpairs n reader sizemap (Just oldcontent)
Nothing -> return sizemap
readpairs n reader sizemap (Just oldcontent) = liftIO reader >>= \case
Just (k, newcontent) ->
let prevlog = parselog oldcontent
currlog = parselog newcontent
newlocs = S.difference currlog prevlog
removedlocs = S.difference prevlog currlog
!sizemap' = accumRepoSizes k (newlocs, removedlocs) sizemap
in do
n' <- if quiet
then pure n
else countdownToMessage n $
showSideAction "calculating repository sizes"
readpairs n' reader sizemap' Nothing
Nothing -> return sizemap
parselog = maybe mempty (S.fromList . parseLoggedLocationsWithoutClusters)
addemptyrepos newsizemap = do
knownuuids <- M.keys <$> uuidDescMap
return $ foldl'
(\m u -> M.insertWith (flip const) u (RepoSize 0) m)
newsizemap
knownuuids
addKeyRepoSize :: Key -> Maybe RepoSize -> Maybe RepoSize
addKeyRepoSize k mrs = case mrs of
Just (RepoSize sz) -> Just $ RepoSize $ sz + ksz
Nothing -> Just $ RepoSize ksz
where
ksz = fromMaybe 0 $ fromKey keySize k
removeKeyRepoSize :: Key -> Maybe RepoSize -> Maybe RepoSize
removeKeyRepoSize k mrs = case mrs of
Just (RepoSize sz) -> Just $ RepoSize $ sz - ksz
Nothing -> Nothing
where
ksz = fromMaybe 0 $ fromKey keySize k
accumRepoSizes :: Key -> (S.Set UUID, S.Set UUID) -> M.Map UUID RepoSize -> M.Map UUID RepoSize
accumRepoSizes k (newlocs, removedlocs) sizemap =
let !sizemap' = foldl' (flip $ M.alter $ addKeyRepoSize k) sizemap newlocs
in foldl' (flip $ M.alter $ removeKeyRepoSize k) sizemap' removedlocs