calcBranchRepoSizes without journal files

This will be used to prime the RepoSizes database, which will always
contain values that correpond to information in the git-annex branch, so
without anything from journal files.

Factored out overJournalFileContents which will later be used to
update Annex.reposizes to include information from journal files.
This will be partitcularly important to support private UUIDs which only
ever get to journal files and not to the branch.
This commit is contained in:
Joey Hess 2024-08-14 03:19:30 -04:00
parent 90a79a6c1e
commit 8ac2685b33
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
8 changed files with 103 additions and 61 deletions

View file

@ -38,6 +38,7 @@ module Annex.Branch (
precache,
UnmergedBranches(..),
overBranchFileContents,
overJournalFileContents,
updatedFromTree,
) where
@ -123,7 +124,7 @@ create :: Annex ()
create = void getBranch
{- Returns the sha of the branch, creating it first if necessary. -}
getBranch :: Annex Git.Ref
getBranch :: Annex Git.Sha
getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha
where
go True = do
@ -706,8 +707,8 @@ needUpdateIndex branchref = do
return (committedref /= branchref)
{- Record that the branch's index has been updated to correspond to a
- given ref of the branch. -}
setIndexSha :: Git.Ref -> Annex ()
- given sha of the branch. -}
setIndexSha :: Git.Sha -> Annex ()
setIndexSha ref = do
f <- fromRepo gitAnnexIndexStatus
writeLogFile f $ fromRef ref ++ "\n"
@ -994,35 +995,45 @@ data UnmergedBranches t
- The action is passed a callback that it can repeatedly call to read
- the next file and its contents. When there are no more files, the
- callback will return Nothing.
-
- Returns the accumulated result of the callback, as well as the sha of
- the branch at the point it was read.
-}
overBranchFileContents
:: (RawFilePath -> Maybe v)
-> Bool
-- ^ When there are new files in the journal that have not yet
-- been committed to the branch, should those files be omitted?
-- When this is False, the callback is run on each journalled file
-- at the end, and so may be run more than once on the same file.
:: Bool
-- ^ Should files in the journal be ignored? When False,
-- the content of journalled files is combined with files in the
-- git-annex branch. And also, at the end, the callback is run
-- on each journalled file, in case some journalled files are new
-- files that do not yet appear in the branch. Note that this means
-- the callback can be run more than once on the same filename,
-- and in this case it's also possible for the callback to be
-- passed some of the same file content repeatedly.
-> (RawFilePath -> Maybe v)
-> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a)
-> Annex (UnmergedBranches a)
overBranchFileContents select omitnewjournalledfiles go = do
-> Annex (UnmergedBranches (a, Git.Sha))
overBranchFileContents ignorejournal select go = do
st <- update
v <- overBranchFileContents' select omitnewjournalledfiles go st
let st' = if ignorejournal
then st { journalIgnorable = True }
else st
v <- overBranchFileContents' select go st'
return $ if not (null (unmergedRefs st))
then UnmergedBranches v
else NoUnmergedBranches v
overBranchFileContents'
:: (RawFilePath -> Maybe v)
-> Bool
-> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a)
-> BranchState
-> Annex a
overBranchFileContents' select omitnewjournalledfiles go st = do
-> Annex (a, Git.Sha)
overBranchFileContents' select go st = do
g <- Annex.gitRepo
branchsha <- getBranch
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
Git.LsTree.LsTreeRecursive
(Git.LsTree.LsTreeLong False)
fullname
branchsha
let select' f = fmap (\v -> (v, f)) (select f)
buf <- liftIO newEmptyMVar
let go' reader = go $ liftIO reader >>= \case
@ -1030,24 +1041,12 @@ overBranchFileContents' select omitnewjournalledfiles go st = do
content' <- checkjournal f content
return (Just (v, f, content'))
Nothing
| journalIgnorable st || omitnewjournalledfiles ->
return Nothing
-- The journal did not get committed to the
-- branch, and may contain new files that
-- are not present in the branch, which
-- need to be provided to the action still.
-- This can cause the action to be run a
-- second time with a file it already ran on.
| otherwise -> liftIO (tryTakeMVar buf) >>= \case
Nothing -> do
jfs <- journalledFiles
pjfs <- journalledFilesPrivate
drain buf jfs pjfs
Just (jfs, pjfs) -> drain buf jfs pjfs
catObjectStreamLsTree l (select' . getTopFilePath . Git.LsTree.file) g go'
| journalIgnorable st -> return Nothing
| otherwise -> overJournalFileContents' buf (handlestale branchsha) select
res <- catObjectStreamLsTree l (select' . getTopFilePath . Git.LsTree.file) g go'
`finally` liftIO (void cleanup)
return (res, branchsha)
where
-- Check the journal, in case it did not get committed to the branch
checkjournal f branchcontent
| journalIgnorable st = return branchcontent
| otherwise = getJournalFileStale (GetPrivate True) f >>= return . \case
@ -1056,20 +1055,50 @@ overBranchFileContents' select omitnewjournalledfiles go st = do
Just journalledcontent
PossiblyStaleJournalledContent journalledcontent ->
Just (fromMaybe mempty branchcontent <> journalledcontent)
drain buf fs pfs = case getnext fs pfs of
handlestale branchsha f journalledcontent = do
-- This is expensive, but happens only when there is a
-- private journal file.
content <- getRef branchsha f
return (content <> journalledcontent)
{- Like overBranchFileContents but only reads the content of journalled
- files. Note that when there are private UUIDs, the journal files may
- only include information about the private UUID, while information about
- other UUIDs has been committed to the git-annex branch.
-}
overJournalFileContents
:: (RawFilePath -> Maybe v)
-> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex a)
-> Annex a
overJournalFileContents select go = do
buf <- liftIO newEmptyMVar
go $ overJournalFileContents' buf handlestale select
where
handlestale _f journalledcontent = return journalledcontent
overJournalFileContents'
:: MVar ([RawFilePath], [RawFilePath])
-> (RawFilePath -> L.ByteString -> Annex L.ByteString)
-> (RawFilePath -> Maybe a)
-> Annex (Maybe (a, RawFilePath, Maybe L.ByteString))
overJournalFileContents' buf handlestale select =
liftIO (tryTakeMVar buf) >>= \case
Nothing -> do
jfs <- journalledFiles
pjfs <- journalledFilesPrivate
drain jfs pjfs
Just (jfs, pjfs) -> drain jfs pjfs
where
drain fs pfs = case getnext fs pfs of
Just (v, f, fs', pfs') -> do
liftIO $ putMVar buf (fs', pfs')
content <- getJournalFileStale (GetPrivate True) f >>= \case
NoJournalledContent -> return Nothing
JournalledContent journalledcontent ->
return (Just journalledcontent)
PossiblyStaleJournalledContent journalledcontent -> do
-- This is expensive, but happens
-- only when there is a private
-- journal file.
content <- getRef fullname f
return (Just (content <> journalledcontent))
PossiblyStaleJournalledContent journalledcontent ->
Just <$> handlestale f journalledcontent
return (Just (v, f, content))
Nothing -> do
liftIO $ putMVar buf ([], [])

View file

@ -592,7 +592,7 @@ gitAnnexIndex r = gitAnnexDir r P.</> "index"
gitAnnexPrivateIndex :: Git.Repo -> RawFilePath
gitAnnexPrivateIndex r = gitAnnexDir r P.</> "index-private"
{- Holds the ref of the git-annex branch that the index was last updated to.
{- Holds the sha of the git-annex branch that the index was last updated to.
-
- The .lck in the name is a historical accident; this is not used as a
- lock. -}

View file

@ -12,26 +12,33 @@ import Annex.Branch (UnmergedBranches(..))
import Types.RepoSize
import Logs.Location
import Logs.UUID
import Git.Types (Sha)
import qualified Data.Map.Strict as M
{- Sum up the sizes of all keys in all repositories, from the information
- in the git-annex branch. New keys that only appear in the journal are
- not included. Can be slow.
- in the git-annex branch, but not the journal. Retuns the sha of the
- branch commit that was used.
-
- The map includes the UUIDs of all known repositories, including
- repositories that are empty.
-}
calcRepoSizes :: Annex (M.Map UUID RepoSize)
calcRepoSizes = do
calcBranchRepoSizes :: Annex (M.Map UUID RepoSize, Sha)
calcBranchRepoSizes = do
knownuuids <- M.keys <$> uuidDescMap
let startmap = M.fromList $ map (\u -> (u, RepoSize 0)) knownuuids
overLocationLogs True startmap accum >>= \case
UnmergedBranches m -> return m
NoUnmergedBranches m -> return m
UnmergedBranches v -> return v
NoUnmergedBranches v -> return v
where
addksz ksz (Just (RepoSize sz)) = Just $ RepoSize $ sz + ksz
addksz ksz Nothing = Just $ RepoSize ksz
accum k locs m = return $
let sz = fromMaybe 0 $ fromKey keySize k
in foldl' (flip $ M.alter $ addksz sz) m locs
{- Given the RepoSizes calculated from the git-annex branch, updates it with
- data from journalled location logs.
-}
journalledRepoSizes :: M.Map UUID RepoSize -> Sha -> Annex (M.Map UUID RepoSize)
journalledRepoSizes m branchsha = undefined --- XXX

View file

@ -288,8 +288,8 @@ withKeyOptions' ko auto mkkeyaction fallbackaction worktreeitems = do
(\reader cont -> checktimelimit (discard reader) cont)
(\k _ () -> keyaction Nothing (SeekInput [], k, mkActionItem k))
>>= \case
Annex.Branch.NoUnmergedBranches () -> return ()
Annex.Branch.UnmergedBranches () -> giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents operating on all keys. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)"
Annex.Branch.NoUnmergedBranches ((), _) -> return ()
Annex.Branch.UnmergedBranches ((), _) -> giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents operating on all keys. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)"
runkeyaction getks = do
keyaction <- mkkeyaction

View file

@ -652,7 +652,7 @@ cachedAllRepoData = do
, return (d, rd)
)
case r of
NoUnmergedBranches (!(d, rd)) -> do
NoUnmergedBranches (!(d, rd), _) -> do
let s' = s { allRepoData = Just d, repoData = rd }
put s'
return s'

View file

@ -187,9 +187,9 @@ updateFromLog db@(ImportFeedDbHandle h) (oldtree, currtree)
-- When initially populating the database, this
-- is faster than diffing from the empty tree
-- and looking up every log file.
scanbranch = Annex.Branch.overBranchFileContents toscan False goscan >>= \case
Annex.Branch.NoUnmergedBranches () -> return ()
Annex.Branch.UnmergedBranches () -> scandiff
scanbranch = Annex.Branch.overBranchFileContents False toscan goscan >>= \case
Annex.Branch.NoUnmergedBranches ((), _) -> return ()
Annex.Branch.UnmergedBranches ((), _) -> scandiff
toscan f
| isUrlLog f = Just ()

View file

@ -599,7 +599,7 @@ limitFullyBalanced mu getgroupmap groupname = Right $ MatchFiles
M.lookup g (uuidsByGroup gm)
maxsizes <- getMaxSizes
-- XXX do not calc this every time!
sizemap <- calcRepoSizes
(sizemap, _sha) <- calcBranchRepoSizes
let keysize = fromMaybe 0 (fromKey keySize key)
currentlocs <- S.fromList <$> loggedLocations key
let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of

View file

@ -45,7 +45,7 @@ import Types.Cluster
import Annex.UUID
import Annex.CatFile
import Annex.VectorClock
import Git.Types (RefDate, Ref)
import Git.Types (RefDate, Ref, Sha)
import qualified Annex
import Data.Time.Clock
@ -219,17 +219,23 @@ loggedKeysFor' u = loggedKeys' isthere
return there
{- This is much faster than loggedKeys. -}
overLocationLogs :: Bool -> v -> (Key -> [UUID] -> v -> Annex v) -> Annex (Annex.Branch.UnmergedBranches v)
overLocationLogs omitnewjournalledfiles v =
overLocationLogs' omitnewjournalledfiles v (flip const)
overLocationLogs
:: Bool
-> v
-> (Key -> [UUID]
-> v
-> Annex v)
-> Annex (Annex.Branch.UnmergedBranches (v, Sha))
overLocationLogs ignorejournal v =
overLocationLogs' ignorejournal v (flip const)
overLocationLogs'
:: Bool
-> v
-> (Annex (Maybe (Key, RawFilePath, Maybe L.ByteString)) -> Annex v -> Annex v)
-> (Key -> [UUID] -> v -> Annex v)
-> Annex (Annex.Branch.UnmergedBranches v)
overLocationLogs' omitnewjournalledfiles iv discarder keyaction = do
-> Annex (Annex.Branch.UnmergedBranches (v, Sha))
overLocationLogs' ignorejournal iv discarder keyaction = do
config <- Annex.getGitConfig
clusters <- getClusters
@ -247,7 +253,7 @@ overLocationLogs' omitnewjournalledfiles iv discarder keyaction = do
)
Nothing -> return v
Annex.Branch.overBranchFileContents getk omitnewjournalledfiles (go iv)
Annex.Branch.overBranchFileContents ignorejournal getk (go iv)
-- Cannot import Logs.Cluster due to a cycle.
-- Annex.clusters gets populated when starting up git-annex.