avoid counting size of keys that are in the journal twice

In calcRepoSizes and also git-annex info, when a key was in the journal,
it was passed to the callback twice, so the calculated size was wrong.
This commit is contained in:
Joey Hess 2024-08-13 13:23:39 -04:00
parent 467d80101a
commit 5afbea25e7
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 25 additions and 20 deletions

View file

@ -994,29 +994,30 @@ data UnmergedBranches t
- The action is passed a callback that it can repeatedly call to read
- the next file and its contents. When there are no more files, the
- callback will return Nothing.
-
- In some cases the callback may return the same file more than once,
- with different content. This happens rarely, only when the journal
- contains additional information, and the last version of the
- file it returns is the most current one.
-}
overBranchFileContents
:: (RawFilePath -> Maybe v)
-> Bool
-- ^ When there are new files in the journal that have not yet
-- been committed to the branch, should those files be omitted?
-- When this is False, the callback is run on each journalled file
-- at the end, and so may be run more than once on the same file.
-> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a)
-> Annex (UnmergedBranches a)
overBranchFileContents select go = do
overBranchFileContents select omitnewjournalledfiles go = do
st <- update
v <- overBranchFileContents' select go st
v <- overBranchFileContents' select omitnewjournalledfiles go st
return $ if not (null (unmergedRefs st))
then UnmergedBranches v
else NoUnmergedBranches v
overBranchFileContents'
:: (RawFilePath -> Maybe v)
-> Bool
-> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a)
-> BranchState
-> Annex a
overBranchFileContents' select go st = do
overBranchFileContents' select omitnewjournalledfiles go st = do
g <- Annex.gitRepo
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
Git.LsTree.LsTreeRecursive
@ -1029,9 +1030,10 @@ overBranchFileContents' select go st = do
content' <- checkjournal f content
return (Just (v, f, content'))
Nothing
| journalIgnorable st -> return Nothing
| journalIgnorable st || omitnewjournalledfiles ->
return Nothing
-- The journal did not get committed to the
-- branch, and may contain files that
-- branch, and may contain new files that
-- are not present in the branch, which
-- need to be provided to the action still.
-- This can cause the action to be run a

View file

@ -16,7 +16,8 @@ import Logs.UUID
import qualified Data.Map.Strict as M
{- Sum up the sizes of all keys in all repositories, from the information
- in the git-annex branch. Can be slow.
- in the git-annex branch. New keys that only appear in the journal are
- not included. Can be slow.
-
- The map includes the UUIDs of all known repositories, including
- repositories that are empty.
@ -25,7 +26,7 @@ calcRepoSizes :: Annex (M.Map UUID RepoSize)
calcRepoSizes = do
knownuuids <- M.keys <$> uuidDescMap
let startmap = M.fromList $ map (\u -> (u, RepoSize 0)) knownuuids
overLocationLogs startmap accum >>= \case
overLocationLogs True startmap accum >>= \case
UnmergedBranches m -> return m
NoUnmergedBranches m -> return m
where

View file

@ -284,7 +284,7 @@ withKeyOptions' ko auto mkkeyaction fallbackaction worktreeitems = do
let discard reader = reader >>= \case
Nothing -> noop
Just _ -> discard reader
overLocationLogs' ()
overLocationLogs' False ()
(\reader cont -> checktimelimit (discard reader) cont)
(\k _ () -> keyaction Nothing (SeekInput [], k, mkActionItem k))
>>= \case

View file

@ -641,7 +641,7 @@ cachedAllRepoData = do
Just _ -> return s
Nothing -> do
matcher <- lift getKeyOnlyMatcher
r <- lift $ overLocationLogs (emptyKeyInfo, mempty) $ \k locs (d, rd) -> do
r <- lift $ overLocationLogs True (emptyKeyInfo, mempty) $ \k locs (d, rd) -> do
ifM (matchOnKey matcher k)
( do
alivelocs <- snd

View file

@ -187,7 +187,7 @@ updateFromLog db@(ImportFeedDbHandle h) (oldtree, currtree)
-- When initially populating the database, this
-- is faster than diffing from the empty tree
-- and looking up every log file.
scanbranch = Annex.Branch.overBranchFileContents toscan goscan >>= \case
scanbranch = Annex.Branch.overBranchFileContents toscan False goscan >>= \case
Annex.Branch.NoUnmergedBranches () -> return ()
Annex.Branch.UnmergedBranches () -> scandiff

View file

@ -219,15 +219,17 @@ loggedKeysFor' u = loggedKeys' isthere
return there
{- This is much faster than loggedKeys. -}
overLocationLogs :: v -> (Key -> [UUID] -> v -> Annex v) -> Annex (Annex.Branch.UnmergedBranches v)
overLocationLogs v = overLocationLogs' v (flip const)
overLocationLogs :: Bool -> v -> (Key -> [UUID] -> v -> Annex v) -> Annex (Annex.Branch.UnmergedBranches v)
overLocationLogs omitnewjournalledfiles v =
overLocationLogs' omitnewjournalledfiles v (flip const)
overLocationLogs'
:: v
:: Bool
-> v
-> (Annex (Maybe (Key, RawFilePath, Maybe L.ByteString)) -> Annex v -> Annex v)
-> (Key -> [UUID] -> v -> Annex v)
-> Annex (Annex.Branch.UnmergedBranches v)
overLocationLogs' iv discarder keyaction = do
overLocationLogs' omitnewjournalledfiles iv discarder keyaction = do
config <- Annex.getGitConfig
clusters <- getClusters
@ -245,7 +247,7 @@ overLocationLogs' iv discarder keyaction = do
)
Nothing -> return v
Annex.Branch.overBranchFileContents getk (go iv)
Annex.Branch.overBranchFileContents getk omitnewjournalledfiles (go iv)
-- Cannot import Logs.Cluster due to a cycle.
-- Annex.clusters gets populated when starting up git-annex.