From 8ac2685b339363a30d6f86f3876baae92966fc30 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 14 Aug 2024 03:19:30 -0400 Subject: [PATCH] calcBranchRepoSizes without journal files This will be used to prime the RepoSizes database, which will always contain values that correpond to information in the git-annex branch, so without anything from journal files. Factored out overJournalFileContents which will later be used to update Annex.reposizes to include information from journal files. This will be partitcularly important to support private UUIDs which only ever get to journal files and not to the branch. --- Annex/Branch.hs | 109 ++++++++++++++++++++++++++--------------- Annex/Locations.hs | 2 +- Annex/RepoSize.hs | 19 ++++--- CmdLine/Seek.hs | 4 +- Command/Info.hs | 2 +- Database/ImportFeed.hs | 6 +-- Limit.hs | 2 +- Logs/Location.hs | 20 +++++--- 8 files changed, 103 insertions(+), 61 deletions(-) diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 9bf613fe71..ed6f648641 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -38,6 +38,7 @@ module Annex.Branch ( precache, UnmergedBranches(..), overBranchFileContents, + overJournalFileContents, updatedFromTree, ) where @@ -123,7 +124,7 @@ create :: Annex () create = void getBranch {- Returns the sha of the branch, creating it first if necessary. -} -getBranch :: Annex Git.Ref +getBranch :: Annex Git.Sha getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha where go True = do @@ -706,8 +707,8 @@ needUpdateIndex branchref = do return (committedref /= branchref) {- Record that the branch's index has been updated to correspond to a - - given ref of the branch. -} -setIndexSha :: Git.Ref -> Annex () + - given sha of the branch. -} +setIndexSha :: Git.Sha -> Annex () setIndexSha ref = do f <- fromRepo gitAnnexIndexStatus writeLogFile f $ fromRef ref ++ "\n" @@ -994,35 +995,45 @@ data UnmergedBranches t - The action is passed a callback that it can repeatedly call to read - the next file and its contents. When there are no more files, the - callback will return Nothing. + - + - Returns the accumulated result of the callback, as well as the sha of + - the branch at the point it was read. -} overBranchFileContents - :: (RawFilePath -> Maybe v) - -> Bool - -- ^ When there are new files in the journal that have not yet - -- been committed to the branch, should those files be omitted? - -- When this is False, the callback is run on each journalled file - -- at the end, and so may be run more than once on the same file. + :: Bool + -- ^ Should files in the journal be ignored? When False, + -- the content of journalled files is combined with files in the + -- git-annex branch. And also, at the end, the callback is run + -- on each journalled file, in case some journalled files are new + -- files that do not yet appear in the branch. Note that this means + -- the callback can be run more than once on the same filename, + -- and in this case it's also possible for the callback to be + -- passed some of the same file content repeatedly. + -> (RawFilePath -> Maybe v) -> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a) - -> Annex (UnmergedBranches a) -overBranchFileContents select omitnewjournalledfiles go = do + -> Annex (UnmergedBranches (a, Git.Sha)) +overBranchFileContents ignorejournal select go = do st <- update - v <- overBranchFileContents' select omitnewjournalledfiles go st + let st' = if ignorejournal + then st { journalIgnorable = True } + else st + v <- overBranchFileContents' select go st' return $ if not (null (unmergedRefs st)) then UnmergedBranches v else NoUnmergedBranches v overBranchFileContents' :: (RawFilePath -> Maybe v) - -> Bool -> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a) -> BranchState - -> Annex a -overBranchFileContents' select omitnewjournalledfiles go st = do + -> Annex (a, Git.Sha) +overBranchFileContents' select go st = do g <- Annex.gitRepo + branchsha <- getBranch (l, cleanup) <- inRepo $ Git.LsTree.lsTree Git.LsTree.LsTreeRecursive (Git.LsTree.LsTreeLong False) - fullname + branchsha let select' f = fmap (\v -> (v, f)) (select f) buf <- liftIO newEmptyMVar let go' reader = go $ liftIO reader >>= \case @@ -1030,24 +1041,12 @@ overBranchFileContents' select omitnewjournalledfiles go st = do content' <- checkjournal f content return (Just (v, f, content')) Nothing - | journalIgnorable st || omitnewjournalledfiles -> - return Nothing - -- The journal did not get committed to the - -- branch, and may contain new files that - -- are not present in the branch, which - -- need to be provided to the action still. - -- This can cause the action to be run a - -- second time with a file it already ran on. - | otherwise -> liftIO (tryTakeMVar buf) >>= \case - Nothing -> do - jfs <- journalledFiles - pjfs <- journalledFilesPrivate - drain buf jfs pjfs - Just (jfs, pjfs) -> drain buf jfs pjfs - catObjectStreamLsTree l (select' . getTopFilePath . Git.LsTree.file) g go' + | journalIgnorable st -> return Nothing + | otherwise -> overJournalFileContents' buf (handlestale branchsha) select + res <- catObjectStreamLsTree l (select' . getTopFilePath . Git.LsTree.file) g go' `finally` liftIO (void cleanup) + return (res, branchsha) where - -- Check the journal, in case it did not get committed to the branch checkjournal f branchcontent | journalIgnorable st = return branchcontent | otherwise = getJournalFileStale (GetPrivate True) f >>= return . \case @@ -1056,20 +1055,50 @@ overBranchFileContents' select omitnewjournalledfiles go st = do Just journalledcontent PossiblyStaleJournalledContent journalledcontent -> Just (fromMaybe mempty branchcontent <> journalledcontent) - - drain buf fs pfs = case getnext fs pfs of + + handlestale branchsha f journalledcontent = do + -- This is expensive, but happens only when there is a + -- private journal file. + content <- getRef branchsha f + return (content <> journalledcontent) + +{- Like overBranchFileContents but only reads the content of journalled + - files. Note that when there are private UUIDs, the journal files may + - only include information about the private UUID, while information about + - other UUIDs has been committed to the git-annex branch. + -} +overJournalFileContents + :: (RawFilePath -> Maybe v) + -> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a) + -> Annex a +overJournalFileContents select go = do + buf <- liftIO newEmptyMVar + go $ overJournalFileContents' buf handlestale select + where + handlestale _f journalledcontent = return journalledcontent + +overJournalFileContents' + :: MVar ([RawFilePath], [RawFilePath]) + -> (RawFilePath -> L.ByteString -> Annex L.ByteString) + -> (RawFilePath -> Maybe a) + -> Annex (Maybe (a, RawFilePath, Maybe L.ByteString)) +overJournalFileContents' buf handlestale select = + liftIO (tryTakeMVar buf) >>= \case + Nothing -> do + jfs <- journalledFiles + pjfs <- journalledFilesPrivate + drain jfs pjfs + Just (jfs, pjfs) -> drain jfs pjfs + where + drain fs pfs = case getnext fs pfs of Just (v, f, fs', pfs') -> do liftIO $ putMVar buf (fs', pfs') content <- getJournalFileStale (GetPrivate True) f >>= \case NoJournalledContent -> return Nothing JournalledContent journalledcontent -> return (Just journalledcontent) - PossiblyStaleJournalledContent journalledcontent -> do - -- This is expensive, but happens - -- only when there is a private - -- journal file. - content <- getRef fullname f - return (Just (content <> journalledcontent)) + PossiblyStaleJournalledContent journalledcontent -> + Just <$> handlestale f journalledcontent return (Just (v, f, content)) Nothing -> do liftIO $ putMVar buf ([], []) diff --git a/Annex/Locations.hs b/Annex/Locations.hs index 2605f74651..0b1ad4d556 100644 --- a/Annex/Locations.hs +++ b/Annex/Locations.hs @@ -592,7 +592,7 @@ gitAnnexIndex r = gitAnnexDir r P. "index" gitAnnexPrivateIndex :: Git.Repo -> RawFilePath gitAnnexPrivateIndex r = gitAnnexDir r P. "index-private" -{- Holds the ref of the git-annex branch that the index was last updated to. +{- Holds the sha of the git-annex branch that the index was last updated to. - - The .lck in the name is a historical accident; this is not used as a - lock. -} diff --git a/Annex/RepoSize.hs b/Annex/RepoSize.hs index 584785cbb8..936101f02a 100644 --- a/Annex/RepoSize.hs +++ b/Annex/RepoSize.hs @@ -12,26 +12,33 @@ import Annex.Branch (UnmergedBranches(..)) import Types.RepoSize import Logs.Location import Logs.UUID +import Git.Types (Sha) import qualified Data.Map.Strict as M {- Sum up the sizes of all keys in all repositories, from the information - - in the git-annex branch. New keys that only appear in the journal are - - not included. Can be slow. + - in the git-annex branch, but not the journal. Retuns the sha of the + - branch commit that was used. - - The map includes the UUIDs of all known repositories, including - repositories that are empty. -} -calcRepoSizes :: Annex (M.Map UUID RepoSize) -calcRepoSizes = do +calcBranchRepoSizes :: Annex (M.Map UUID RepoSize, Sha) +calcBranchRepoSizes = do knownuuids <- M.keys <$> uuidDescMap let startmap = M.fromList $ map (\u -> (u, RepoSize 0)) knownuuids overLocationLogs True startmap accum >>= \case - UnmergedBranches m -> return m - NoUnmergedBranches m -> return m + UnmergedBranches v -> return v + NoUnmergedBranches v -> return v where addksz ksz (Just (RepoSize sz)) = Just $ RepoSize $ sz + ksz addksz ksz Nothing = Just $ RepoSize ksz accum k locs m = return $ let sz = fromMaybe 0 $ fromKey keySize k in foldl' (flip $ M.alter $ addksz sz) m locs + +{- Given the RepoSizes calculated from the git-annex branch, updates it with + - data from journalled location logs. + -} +journalledRepoSizes :: M.Map UUID RepoSize -> Sha -> Annex (M.Map UUID RepoSize) +journalledRepoSizes m branchsha = undefined --- XXX diff --git a/CmdLine/Seek.hs b/CmdLine/Seek.hs index d9a6b81499..f846695016 100644 --- a/CmdLine/Seek.hs +++ b/CmdLine/Seek.hs @@ -288,8 +288,8 @@ withKeyOptions' ko auto mkkeyaction fallbackaction worktreeitems = do (\reader cont -> checktimelimit (discard reader) cont) (\k _ () -> keyaction Nothing (SeekInput [], k, mkActionItem k)) >>= \case - Annex.Branch.NoUnmergedBranches () -> return () - Annex.Branch.UnmergedBranches () -> giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents operating on all keys. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)" + Annex.Branch.NoUnmergedBranches ((), _) -> return () + Annex.Branch.UnmergedBranches ((), _) -> giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents operating on all keys. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)" runkeyaction getks = do keyaction <- mkkeyaction diff --git a/Command/Info.hs b/Command/Info.hs index 2a061efdfb..5906fe3b43 100644 --- a/Command/Info.hs +++ b/Command/Info.hs @@ -652,7 +652,7 @@ cachedAllRepoData = do , return (d, rd) ) case r of - NoUnmergedBranches (!(d, rd)) -> do + NoUnmergedBranches (!(d, rd), _) -> do let s' = s { allRepoData = Just d, repoData = rd } put s' return s' diff --git a/Database/ImportFeed.hs b/Database/ImportFeed.hs index c17eb0ca04..5f35ff3505 100644 --- a/Database/ImportFeed.hs +++ b/Database/ImportFeed.hs @@ -187,9 +187,9 @@ updateFromLog db@(ImportFeedDbHandle h) (oldtree, currtree) -- When initially populating the database, this -- is faster than diffing from the empty tree -- and looking up every log file. - scanbranch = Annex.Branch.overBranchFileContents toscan False goscan >>= \case - Annex.Branch.NoUnmergedBranches () -> return () - Annex.Branch.UnmergedBranches () -> scandiff + scanbranch = Annex.Branch.overBranchFileContents False toscan goscan >>= \case + Annex.Branch.NoUnmergedBranches ((), _) -> return () + Annex.Branch.UnmergedBranches ((), _) -> scandiff toscan f | isUrlLog f = Just () diff --git a/Limit.hs b/Limit.hs index ff988561fe..753aa4469a 100644 --- a/Limit.hs +++ b/Limit.hs @@ -599,7 +599,7 @@ limitFullyBalanced mu getgroupmap groupname = Right $ MatchFiles M.lookup g (uuidsByGroup gm) maxsizes <- getMaxSizes -- XXX do not calc this every time! - sizemap <- calcRepoSizes + (sizemap, _sha) <- calcBranchRepoSizes let keysize = fromMaybe 0 (fromKey keySize key) currentlocs <- S.fromList <$> loggedLocations key let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of diff --git a/Logs/Location.hs b/Logs/Location.hs index c07b19cd2a..ddcbb58234 100644 --- a/Logs/Location.hs +++ b/Logs/Location.hs @@ -45,7 +45,7 @@ import Types.Cluster import Annex.UUID import Annex.CatFile import Annex.VectorClock -import Git.Types (RefDate, Ref) +import Git.Types (RefDate, Ref, Sha) import qualified Annex import Data.Time.Clock @@ -219,17 +219,23 @@ loggedKeysFor' u = loggedKeys' isthere return there {- This is much faster than loggedKeys. -} -overLocationLogs :: Bool -> v -> (Key -> [UUID] -> v -> Annex v) -> Annex (Annex.Branch.UnmergedBranches v) -overLocationLogs omitnewjournalledfiles v = - overLocationLogs' omitnewjournalledfiles v (flip const) +overLocationLogs + :: Bool + -> v + -> (Key -> [UUID] + -> v + -> Annex v) + -> Annex (Annex.Branch.UnmergedBranches (v, Sha)) +overLocationLogs ignorejournal v = + overLocationLogs' ignorejournal v (flip const) overLocationLogs' :: Bool -> v -> (Annex (Maybe (Key, RawFilePath, Maybe L.ByteString)) -> Annex v -> Annex v) -> (Key -> [UUID] -> v -> Annex v) - -> Annex (Annex.Branch.UnmergedBranches v) -overLocationLogs' omitnewjournalledfiles iv discarder keyaction = do + -> Annex (Annex.Branch.UnmergedBranches (v, Sha)) +overLocationLogs' ignorejournal iv discarder keyaction = do config <- Annex.getGitConfig clusters <- getClusters @@ -247,7 +253,7 @@ overLocationLogs' omitnewjournalledfiles iv discarder keyaction = do ) Nothing -> return v - Annex.Branch.overBranchFileContents getk omitnewjournalledfiles (go iv) + Annex.Branch.overBranchFileContents ignorejournal getk (go iv) -- Cannot import Logs.Cluster due to a cycle. -- Annex.clusters gets populated when starting up git-annex.