calcBranchRepoSizes without journal files

This will be used to prime the RepoSizes database, which will always
contain values that correpond to information in the git-annex branch, so
without anything from journal files.

Factored out overJournalFileContents which will later be used to
update Annex.reposizes to include information from journal files.
This will be partitcularly important to support private UUIDs which only
ever get to journal files and not to the branch.
This commit is contained in:
Joey Hess 2024-08-14 03:19:30 -04:00
parent 90a79a6c1e
commit 8ac2685b33
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
8 changed files with 103 additions and 61 deletions

View file

@ -38,6 +38,7 @@ module Annex.Branch (
precache, precache,
UnmergedBranches(..), UnmergedBranches(..),
overBranchFileContents, overBranchFileContents,
overJournalFileContents,
updatedFromTree, updatedFromTree,
) where ) where
@ -123,7 +124,7 @@ create :: Annex ()
create = void getBranch create = void getBranch
{- Returns the sha of the branch, creating it first if necessary. -} {- Returns the sha of the branch, creating it first if necessary. -}
getBranch :: Annex Git.Ref getBranch :: Annex Git.Sha
getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha
where where
go True = do go True = do
@ -706,8 +707,8 @@ needUpdateIndex branchref = do
return (committedref /= branchref) return (committedref /= branchref)
{- Record that the branch's index has been updated to correspond to a {- Record that the branch's index has been updated to correspond to a
- given ref of the branch. -} - given sha of the branch. -}
setIndexSha :: Git.Ref -> Annex () setIndexSha :: Git.Sha -> Annex ()
setIndexSha ref = do setIndexSha ref = do
f <- fromRepo gitAnnexIndexStatus f <- fromRepo gitAnnexIndexStatus
writeLogFile f $ fromRef ref ++ "\n" writeLogFile f $ fromRef ref ++ "\n"
@ -994,35 +995,45 @@ data UnmergedBranches t
- The action is passed a callback that it can repeatedly call to read - The action is passed a callback that it can repeatedly call to read
- the next file and its contents. When there are no more files, the - the next file and its contents. When there are no more files, the
- callback will return Nothing. - callback will return Nothing.
-
- Returns the accumulated result of the callback, as well as the sha of
- the branch at the point it was read.
-} -}
overBranchFileContents overBranchFileContents
:: (RawFilePath -> Maybe v) :: Bool
-> Bool -- ^ Should files in the journal be ignored? When False,
-- ^ When there are new files in the journal that have not yet -- the content of journalled files is combined with files in the
-- been committed to the branch, should those files be omitted? -- git-annex branch. And also, at the end, the callback is run
-- When this is False, the callback is run on each journalled file -- on each journalled file, in case some journalled files are new
-- at the end, and so may be run more than once on the same file. -- files that do not yet appear in the branch. Note that this means
-- the callback can be run more than once on the same filename,
-- and in this case it's also possible for the callback to be
-- passed some of the same file content repeatedly.
-> (RawFilePath -> Maybe v)
-> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a) -> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a)
-> Annex (UnmergedBranches a) -> Annex (UnmergedBranches (a, Git.Sha))
overBranchFileContents select omitnewjournalledfiles go = do overBranchFileContents ignorejournal select go = do
st <- update st <- update
v <- overBranchFileContents' select omitnewjournalledfiles go st let st' = if ignorejournal
then st { journalIgnorable = True }
else st
v <- overBranchFileContents' select go st'
return $ if not (null (unmergedRefs st)) return $ if not (null (unmergedRefs st))
then UnmergedBranches v then UnmergedBranches v
else NoUnmergedBranches v else NoUnmergedBranches v
overBranchFileContents' overBranchFileContents'
:: (RawFilePath -> Maybe v) :: (RawFilePath -> Maybe v)
-> Bool
-> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a) -> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a)
-> BranchState -> BranchState
-> Annex a -> Annex (a, Git.Sha)
overBranchFileContents' select omitnewjournalledfiles go st = do overBranchFileContents' select go st = do
g <- Annex.gitRepo g <- Annex.gitRepo
branchsha <- getBranch
(l, cleanup) <- inRepo $ Git.LsTree.lsTree (l, cleanup) <- inRepo $ Git.LsTree.lsTree
Git.LsTree.LsTreeRecursive Git.LsTree.LsTreeRecursive
(Git.LsTree.LsTreeLong False) (Git.LsTree.LsTreeLong False)
fullname branchsha
let select' f = fmap (\v -> (v, f)) (select f) let select' f = fmap (\v -> (v, f)) (select f)
buf <- liftIO newEmptyMVar buf <- liftIO newEmptyMVar
let go' reader = go $ liftIO reader >>= \case let go' reader = go $ liftIO reader >>= \case
@ -1030,24 +1041,12 @@ overBranchFileContents' select omitnewjournalledfiles go st = do
content' <- checkjournal f content content' <- checkjournal f content
return (Just (v, f, content')) return (Just (v, f, content'))
Nothing Nothing
| journalIgnorable st || omitnewjournalledfiles -> | journalIgnorable st -> return Nothing
return Nothing | otherwise -> overJournalFileContents' buf (handlestale branchsha) select
-- The journal did not get committed to the res <- catObjectStreamLsTree l (select' . getTopFilePath . Git.LsTree.file) g go'
-- branch, and may contain new files that
-- are not present in the branch, which
-- need to be provided to the action still.
-- This can cause the action to be run a
-- second time with a file it already ran on.
| otherwise -> liftIO (tryTakeMVar buf) >>= \case
Nothing -> do
jfs <- journalledFiles
pjfs <- journalledFilesPrivate
drain buf jfs pjfs
Just (jfs, pjfs) -> drain buf jfs pjfs
catObjectStreamLsTree l (select' . getTopFilePath . Git.LsTree.file) g go'
`finally` liftIO (void cleanup) `finally` liftIO (void cleanup)
return (res, branchsha)
where where
-- Check the journal, in case it did not get committed to the branch
checkjournal f branchcontent checkjournal f branchcontent
| journalIgnorable st = return branchcontent | journalIgnorable st = return branchcontent
| otherwise = getJournalFileStale (GetPrivate True) f >>= return . \case | otherwise = getJournalFileStale (GetPrivate True) f >>= return . \case
@ -1057,19 +1056,49 @@ overBranchFileContents' select omitnewjournalledfiles go st = do
PossiblyStaleJournalledContent journalledcontent -> PossiblyStaleJournalledContent journalledcontent ->
Just (fromMaybe mempty branchcontent <> journalledcontent) Just (fromMaybe mempty branchcontent <> journalledcontent)
drain buf fs pfs = case getnext fs pfs of handlestale branchsha f journalledcontent = do
-- This is expensive, but happens only when there is a
-- private journal file.
content <- getRef branchsha f
return (content <> journalledcontent)
{- Like overBranchFileContents but only reads the content of journalled
- files. Note that when there are private UUIDs, the journal files may
- only include information about the private UUID, while information about
- other UUIDs has been committed to the git-annex branch.
-}
overJournalFileContents
:: (RawFilePath -> Maybe v)
-> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a)
-> Annex a
overJournalFileContents select go = do
buf <- liftIO newEmptyMVar
go $ overJournalFileContents' buf handlestale select
where
handlestale _f journalledcontent = return journalledcontent
overJournalFileContents'
:: MVar ([RawFilePath], [RawFilePath])
-> (RawFilePath -> L.ByteString -> Annex L.ByteString)
-> (RawFilePath -> Maybe a)
-> Annex (Maybe (a, RawFilePath, Maybe L.ByteString))
overJournalFileContents' buf handlestale select =
liftIO (tryTakeMVar buf) >>= \case
Nothing -> do
jfs <- journalledFiles
pjfs <- journalledFilesPrivate
drain jfs pjfs
Just (jfs, pjfs) -> drain jfs pjfs
where
drain fs pfs = case getnext fs pfs of
Just (v, f, fs', pfs') -> do Just (v, f, fs', pfs') -> do
liftIO $ putMVar buf (fs', pfs') liftIO $ putMVar buf (fs', pfs')
content <- getJournalFileStale (GetPrivate True) f >>= \case content <- getJournalFileStale (GetPrivate True) f >>= \case
NoJournalledContent -> return Nothing NoJournalledContent -> return Nothing
JournalledContent journalledcontent -> JournalledContent journalledcontent ->
return (Just journalledcontent) return (Just journalledcontent)
PossiblyStaleJournalledContent journalledcontent -> do PossiblyStaleJournalledContent journalledcontent ->
-- This is expensive, but happens Just <$> handlestale f journalledcontent
-- only when there is a private
-- journal file.
content <- getRef fullname f
return (Just (content <> journalledcontent))
return (Just (v, f, content)) return (Just (v, f, content))
Nothing -> do Nothing -> do
liftIO $ putMVar buf ([], []) liftIO $ putMVar buf ([], [])

View file

@ -592,7 +592,7 @@ gitAnnexIndex r = gitAnnexDir r P.</> "index"
gitAnnexPrivateIndex :: Git.Repo -> RawFilePath gitAnnexPrivateIndex :: Git.Repo -> RawFilePath
gitAnnexPrivateIndex r = gitAnnexDir r P.</> "index-private" gitAnnexPrivateIndex r = gitAnnexDir r P.</> "index-private"
{- Holds the ref of the git-annex branch that the index was last updated to. {- Holds the sha of the git-annex branch that the index was last updated to.
- -
- The .lck in the name is a historical accident; this is not used as a - The .lck in the name is a historical accident; this is not used as a
- lock. -} - lock. -}

View file

@ -12,26 +12,33 @@ import Annex.Branch (UnmergedBranches(..))
import Types.RepoSize import Types.RepoSize
import Logs.Location import Logs.Location
import Logs.UUID import Logs.UUID
import Git.Types (Sha)
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
{- Sum up the sizes of all keys in all repositories, from the information {- Sum up the sizes of all keys in all repositories, from the information
- in the git-annex branch. New keys that only appear in the journal are - in the git-annex branch, but not the journal. Retuns the sha of the
- not included. Can be slow. - branch commit that was used.
- -
- The map includes the UUIDs of all known repositories, including - The map includes the UUIDs of all known repositories, including
- repositories that are empty. - repositories that are empty.
-} -}
calcRepoSizes :: Annex (M.Map UUID RepoSize) calcBranchRepoSizes :: Annex (M.Map UUID RepoSize, Sha)
calcRepoSizes = do calcBranchRepoSizes = do
knownuuids <- M.keys <$> uuidDescMap knownuuids <- M.keys <$> uuidDescMap
let startmap = M.fromList $ map (\u -> (u, RepoSize 0)) knownuuids let startmap = M.fromList $ map (\u -> (u, RepoSize 0)) knownuuids
overLocationLogs True startmap accum >>= \case overLocationLogs True startmap accum >>= \case
UnmergedBranches m -> return m UnmergedBranches v -> return v
NoUnmergedBranches m -> return m NoUnmergedBranches v -> return v
where where
addksz ksz (Just (RepoSize sz)) = Just $ RepoSize $ sz + ksz addksz ksz (Just (RepoSize sz)) = Just $ RepoSize $ sz + ksz
addksz ksz Nothing = Just $ RepoSize ksz addksz ksz Nothing = Just $ RepoSize ksz
accum k locs m = return $ accum k locs m = return $
let sz = fromMaybe 0 $ fromKey keySize k let sz = fromMaybe 0 $ fromKey keySize k
in foldl' (flip $ M.alter $ addksz sz) m locs in foldl' (flip $ M.alter $ addksz sz) m locs
{- Given the RepoSizes calculated from the git-annex branch, updates it with
- data from journalled location logs.
-}
journalledRepoSizes :: M.Map UUID RepoSize -> Sha -> Annex (M.Map UUID RepoSize)
journalledRepoSizes m branchsha = undefined --- XXX

View file

@ -288,8 +288,8 @@ withKeyOptions' ko auto mkkeyaction fallbackaction worktreeitems = do
(\reader cont -> checktimelimit (discard reader) cont) (\reader cont -> checktimelimit (discard reader) cont)
(\k _ () -> keyaction Nothing (SeekInput [], k, mkActionItem k)) (\k _ () -> keyaction Nothing (SeekInput [], k, mkActionItem k))
>>= \case >>= \case
Annex.Branch.NoUnmergedBranches () -> return () Annex.Branch.NoUnmergedBranches ((), _) -> return ()
Annex.Branch.UnmergedBranches () -> giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents operating on all keys. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)" Annex.Branch.UnmergedBranches ((), _) -> giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents operating on all keys. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)"
runkeyaction getks = do runkeyaction getks = do
keyaction <- mkkeyaction keyaction <- mkkeyaction

View file

@ -652,7 +652,7 @@ cachedAllRepoData = do
, return (d, rd) , return (d, rd)
) )
case r of case r of
NoUnmergedBranches (!(d, rd)) -> do NoUnmergedBranches (!(d, rd), _) -> do
let s' = s { allRepoData = Just d, repoData = rd } let s' = s { allRepoData = Just d, repoData = rd }
put s' put s'
return s' return s'

View file

@ -187,9 +187,9 @@ updateFromLog db@(ImportFeedDbHandle h) (oldtree, currtree)
-- When initially populating the database, this -- When initially populating the database, this
-- is faster than diffing from the empty tree -- is faster than diffing from the empty tree
-- and looking up every log file. -- and looking up every log file.
scanbranch = Annex.Branch.overBranchFileContents toscan False goscan >>= \case scanbranch = Annex.Branch.overBranchFileContents False toscan goscan >>= \case
Annex.Branch.NoUnmergedBranches () -> return () Annex.Branch.NoUnmergedBranches ((), _) -> return ()
Annex.Branch.UnmergedBranches () -> scandiff Annex.Branch.UnmergedBranches ((), _) -> scandiff
toscan f toscan f
| isUrlLog f = Just () | isUrlLog f = Just ()

View file

@ -599,7 +599,7 @@ limitFullyBalanced mu getgroupmap groupname = Right $ MatchFiles
M.lookup g (uuidsByGroup gm) M.lookup g (uuidsByGroup gm)
maxsizes <- getMaxSizes maxsizes <- getMaxSizes
-- XXX do not calc this every time! -- XXX do not calc this every time!
sizemap <- calcRepoSizes (sizemap, _sha) <- calcBranchRepoSizes
let keysize = fromMaybe 0 (fromKey keySize key) let keysize = fromMaybe 0 (fromKey keySize key)
currentlocs <- S.fromList <$> loggedLocations key currentlocs <- S.fromList <$> loggedLocations key
let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of

View file

@ -45,7 +45,7 @@ import Types.Cluster
import Annex.UUID import Annex.UUID
import Annex.CatFile import Annex.CatFile
import Annex.VectorClock import Annex.VectorClock
import Git.Types (RefDate, Ref) import Git.Types (RefDate, Ref, Sha)
import qualified Annex import qualified Annex
import Data.Time.Clock import Data.Time.Clock
@ -219,17 +219,23 @@ loggedKeysFor' u = loggedKeys' isthere
return there return there
{- This is much faster than loggedKeys. -} {- This is much faster than loggedKeys. -}
overLocationLogs :: Bool -> v -> (Key -> [UUID] -> v -> Annex v) -> Annex (Annex.Branch.UnmergedBranches v) overLocationLogs
overLocationLogs omitnewjournalledfiles v = :: Bool
overLocationLogs' omitnewjournalledfiles v (flip const) -> v
-> (Key -> [UUID]
-> v
-> Annex v)
-> Annex (Annex.Branch.UnmergedBranches (v, Sha))
overLocationLogs ignorejournal v =
overLocationLogs' ignorejournal v (flip const)
overLocationLogs' overLocationLogs'
:: Bool :: Bool
-> v -> v
-> (Annex (Maybe (Key, RawFilePath, Maybe L.ByteString)) -> Annex v -> Annex v) -> (Annex (Maybe (Key, RawFilePath, Maybe L.ByteString)) -> Annex v -> Annex v)
-> (Key -> [UUID] -> v -> Annex v) -> (Key -> [UUID] -> v -> Annex v)
-> Annex (Annex.Branch.UnmergedBranches v) -> Annex (Annex.Branch.UnmergedBranches (v, Sha))
overLocationLogs' omitnewjournalledfiles iv discarder keyaction = do overLocationLogs' ignorejournal iv discarder keyaction = do
config <- Annex.getGitConfig config <- Annex.getGitConfig
clusters <- getClusters clusters <- getClusters
@ -247,7 +253,7 @@ overLocationLogs' omitnewjournalledfiles iv discarder keyaction = do
) )
Nothing -> return v Nothing -> return v
Annex.Branch.overBranchFileContents getk omitnewjournalledfiles (go iv) Annex.Branch.overBranchFileContents ignorejournal getk (go iv)
-- Cannot import Logs.Cluster due to a cycle. -- Cannot import Logs.Cluster due to a cycle.
-- Annex.clusters gets populated when starting up git-annex. -- Annex.clusters gets populated when starting up git-annex.