merge git-annex branch in memory in read-only repository

Improved support for using git-annex in a read-only repository: git-annex
branch information from remotes that cannot be merged into the git-annex
branch will no longer crash it, but will instead be merged in memory.

To avoid this making git-annex behave one way in a read-only repository,
and another way when it can write, it's important that Annex.Branch.get
return the same thing (modulo log file compaction) in both cases.

This manages that mostly. There are some exceptions:

- When there is a transition in one of the remote git-annex branches
  that has not yet been applied to the local or other git-annex branches.
  Such transitions are not handled by the in-memory merge.
- `git-annex log` runs git log on the git-annex branch, and so
  it will not be able to show information coming from the other, not yet
  merged branches.
- Annex.Branch.files only looks at files in the git-annex branch and not
  unmerged branches. This affects git-annex info output.
- Annex.Branch.overBranchFileContents likewise only looks at the
  git-annex branch and not unmerged branches. Affects --all and
  also importfeed (but importfeed cannot work in a read-only repo
  anyway).
- CmdLine.Seek.seekFilteredKeys when precaching location logs.
  Note use of Annex.Branch.fullname
- Database.ContentIdentifier.needsUpdateFromLog and updateFromLog

These warts make this not suitable to be merged yet.

This read-only code path is more expensive, since it has to query several
branches. The value does get cached, but large queries will still be
slower in a read-only repository when there are unmerged git-annex
branches.

When annex.merge-annex-branches=false, updateTo skips doing anything,
and so the read-only repository code does not get triggered. So a user who
is bothered by the extra work can set that.

Other writes to the repository can still result in permissions errors.
This includes the initial creation of the git-annex branch, and of course
any writes to the git-annex branch.

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2021-12-26 14:28:42 -04:00
parent da6aa6e944
commit 6d7ecd9e5d
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 124 additions and 82 deletions

View file

@ -141,17 +141,12 @@ getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha
{- Ensures that the branch and index are up-to-date; should be {- Ensures that the branch and index are up-to-date; should be
- called before data is read from it. Runs only once per git-annex run. -} - called before data is read from it. Runs only once per git-annex run. -}
update :: Annex BranchState update :: Annex BranchState
update = runUpdateOnce $ journalClean <$$> updateTo =<< siblingBranches update = runUpdateOnce $ updateTo =<< siblingBranches
{- Forces an update even if one has already been run. -} {- Forces an update even if one has already been run. -}
forceUpdate :: Annex UpdateMade forceUpdate :: Annex UpdateMade
forceUpdate = updateTo =<< siblingBranches forceUpdate = updateTo =<< siblingBranches
data UpdateMade = UpdateMade
{ refsWereMerged :: Bool
, journalClean :: Bool
}
{- Merges the specified Refs into the index, if they have any changes not {- Merges the specified Refs into the index, if they have any changes not
- already in it. The Branch names are only used in the commit message; - already in it. The Branch names are only used in the commit message;
- it's even possible that the provided Branches have not been updated to - it's even possible that the provided Branches have not been updated to
@ -167,8 +162,6 @@ data UpdateMade = UpdateMade
- -
- Also handles performing any Transitions that have not yet been - Also handles performing any Transitions that have not yet been
- performed, in either the local branch, or the Refs. - performed, in either the local branch, or the Refs.
-
- Returns True if any refs were merged in, False otherwise.
-} -}
updateTo :: [(Git.Sha, Git.Branch)] -> Annex UpdateMade updateTo :: [(Git.Sha, Git.Branch)] -> Annex UpdateMade
updateTo pairs = ifM (annexMergeAnnexBranches <$> Annex.getGitConfig) updateTo pairs = ifM (annexMergeAnnexBranches <$> Annex.getGitConfig)
@ -180,7 +173,6 @@ updateTo' :: [(Git.Sha, Git.Branch)] -> Annex UpdateMade
updateTo' pairs = do updateTo' pairs = do
-- ensure branch exists, and get its current ref -- ensure branch exists, and get its current ref
branchref <- getBranch branchref <- getBranch
dirty <- journalDirty gitAnnexJournalDir
ignoredrefs <- getIgnoredRefs ignoredrefs <- getIgnoredRefs
let unignoredrefs = excludeset ignoredrefs pairs let unignoredrefs = excludeset ignoredrefs pairs
tomerge <- if null unignoredrefs tomerge <- if null unignoredrefs
@ -188,42 +180,50 @@ updateTo' pairs = do
else do else do
mergedrefs <- getMergedRefs mergedrefs <- getMergedRefs
filterM isnewer (excludeset mergedrefs unignoredrefs) filterM isnewer (excludeset mergedrefs unignoredrefs)
journalcleaned <- if null tomerge {- In a read-only repository, catching permission denied lets
{- Even when no refs need to be merged, the index - query operations still work, although they will need to do
- may still be updated if the branch has gotten ahead - additional work since the refs are not merged. -}
- of the index, or just if the journal is dirty. -} catchPermissionDenied
then ifM (needUpdateIndex branchref) (const (return (UpdateFailedPermissions (branchref : map fst tomerge))))
( lockJournal $ \jl -> do (go branchref tomerge)
forceUpdateIndex jl branchref
{- When there are journalled changes
- as well as the branch being updated,
- a commit needs to be done. -}
when dirty $
go branchref dirty [] jl
return True
, if dirty
then ifM (annexAlwaysCommit <$> Annex.getGitConfig)
( do
lockJournal $ go branchref dirty []
return True
, return False
)
else return True
)
else do
lockJournal $ go branchref dirty tomerge
return True
journalclean <- if journalcleaned
then not <$> privateUUIDsKnown
else pure False
return $ UpdateMade
{ refsWereMerged = not (null tomerge)
, journalClean = journalclean
}
where where
excludeset s = filter (\(r, _) -> S.notMember r s) excludeset s = filter (\(r, _) -> S.notMember r s)
isnewer (r, _) = inRepo $ Git.Branch.changed fullname r isnewer (r, _) = inRepo $ Git.Branch.changed fullname r
go branchref dirty tomerge jl = stagejournalwhen dirty jl $ do go branchref tomerge = do
dirty <- journalDirty gitAnnexJournalDir
journalcleaned <- if null tomerge
{- Even when no refs need to be merged, the index
- may still be updated if the branch has gotten ahead
- of the index, or just if the journal is dirty. -}
then ifM (needUpdateIndex branchref)
( lockJournal $ \jl -> do
forceUpdateIndex jl branchref
{- When there are journalled changes
- as well as the branch being updated,
- a commit needs to be done. -}
when dirty $
go' branchref dirty [] jl
return True
, if dirty
then ifM (annexAlwaysCommit <$> Annex.getGitConfig)
( lockJournal $ \jl -> do
go' branchref dirty [] jl
return True
, return False
)
else return True
)
else lockJournal $ \jl -> do
go' branchref dirty tomerge jl
return True
journalclean <- if journalcleaned
then not <$> privateUUIDsKnown
else pure False
return $ UpdateMade
{ refsWereMerged = not (null tomerge)
, journalClean = journalclean
}
go' branchref dirty tomerge jl = stagejournalwhen dirty jl $ do
let (refs, branches) = unzip tomerge let (refs, branches) = unzip tomerge
merge_desc <- if null tomerge merge_desc <- if null tomerge
then commitMessage then commitMessage
@ -254,22 +254,33 @@ updateTo' pairs = do
| otherwise = withIndex a | otherwise = withIndex a
{- Gets the content of a file, which may be in the journal, or in the index {- Gets the content of a file, which may be in the journal, or in the index
- (and committed to the branch). - (and committed to the branch).
-
- Returns an empty string if the file doesn't exist yet.
- -
- Updates the branch if necessary, to ensure the most up-to-date available - Updates the branch if necessary, to ensure the most up-to-date available
- content is returned. - content is returned. When permissions prevent updating the branch,
- - reads the content from the journal, plus the branch, plus all unmerged
- Returns an empty string if the file doesn't exist yet. -} - refs.
-}
get :: RawFilePath -> Annex L.ByteString get :: RawFilePath -> Annex L.ByteString
get file = getCache file >>= \case get file = do
Just content -> return content st <- update
Nothing -> do case getCache file st of
st <- update Just content -> return content
content <- if journalIgnorable st Nothing -> do
then getRef fullname file content <- if journalIgnorable st
else getLocal file then getRef fullname file
setCache file content else if null (unmergedRefs st)
return content then getLocal file
else unmergedbranchfallback (unmergedRefs st)
setCache file content
return content
where
unmergedbranchfallback refs = do
l <- getLocal file
bs <- forM refs $ \ref -> getRef ref file
return (l <> mconcat bs)
{- Used to cache the value of a file, which has been read from the branch {- Used to cache the value of a file, which has been read from the branch
- using some optimised method. The journal has to be checked, in case - using some optimised method. The journal has to be checked, in case
@ -285,7 +296,7 @@ precache file branchcontent = do
JournalledContent journalcontent -> journalcontent JournalledContent journalcontent -> journalcontent
PossiblyStaleJournalledContent journalcontent -> PossiblyStaleJournalledContent journalcontent ->
branchcontent <> journalcontent branchcontent <> journalcontent
Annex.BranchState.setCache file content setCache file content
{- Like get, but does not merge the branch, so the info returned may not {- Like get, but does not merge the branch, so the info returned may not
- reflect changes in remotes. - reflect changes in remotes.
@ -452,7 +463,7 @@ commitIndex' jl branchref message basemessage retrynum parents = do
{- Lists all files on the branch, including ones in the journal {- Lists all files on the branch, including ones in the journal
- that have not been committed yet. There may be duplicates in the list. -} - that have not been committed yet. There may be duplicates in the list. -}
files :: Annex ([RawFilePath], IO Bool) tfiles :: Annex ([RawFilePath], IO Bool)
files = do files = do
_ <- update _ <- update
(bfs, cleanup) <- branchFiles (bfs, cleanup) <- branchFiles

View file

@ -2,7 +2,7 @@
- -
- Runtime state about the git-annex branch, and a small cache. - Runtime state about the git-annex branch, and a small cache.
- -
- Copyright 2011-2020 Joey Hess <id@joeyh.name> - Copyright 2011-2021 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -13,6 +13,7 @@ import Annex.Common
import Types.BranchState import Types.BranchState
import qualified Annex import qualified Annex
import Logs import Logs
import qualified Git
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
@ -30,29 +31,47 @@ checkIndexOnce a = unlessM (indexChecked <$> getState) $ do
a a
changeState $ \s -> s { indexChecked = True } changeState $ \s -> s { indexChecked = True }
data UpdateMade
= UpdateMade
{ refsWereMerged :: Bool
, journalClean :: Bool
}
| UpdateFailedPermissions
{ refsUnmerged :: [Git.Sha]
}
{- Runs an action to update the branch, if it's not been updated before {- Runs an action to update the branch, if it's not been updated before
- in this run of git-annex. - in this run of git-annex.
- -
- The action should return True if anything that was in the journal
- before got staged (or if the journal was empty). That lets an optimisation
- be done: The journal then does not need to be checked going forward,
- until new information gets written to it.
-
- When interactive access is enabled, the journal is always checked when - When interactive access is enabled, the journal is always checked when
- reading values from the branch, and so this does not need to update - reading values from the branch, and so this does not need to update
- the branch. - the branch.
-
- When the action leaves the journal clean, by staging anything that
- was in it, an optimisation is enabled: The journal does not need to
- be checked going forward, until new information gets written to it.
-
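- When the action is unable to update the branch due to a permissions
- problem, the branch is still marked as updated so the update is not
- retried, the journal is not treated as ignorable, the refs that could
- not be merged are recorded in unmergedRefs, and the cache of file
- contents is emptied.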
-} -}
runUpdateOnce :: Annex Bool -> Annex BranchState runUpdateOnce :: Annex UpdateMade -> Annex BranchState
runUpdateOnce a = do runUpdateOnce update = do
st <- getState st <- getState
if branchUpdated st || needInteractiveAccess st if branchUpdated st || needInteractiveAccess st
then return st then return st
else do else do
journalstaged <- a um <- update
let stf = \st' -> st' let stf = case um of
{ branchUpdated = True UpdateMade {} -> \st' -> st'
, journalIgnorable = journalstaged { branchUpdated = True
} , journalIgnorable = journalClean um
}
UpdateFailedPermissions {} -> \st' -> st'
{ branchUpdated = True
, journalIgnorable = False
, unmergedRefs = refsUnmerged um
, cachedFileContents = []
}
changeState stf changeState stf
return (stf st) return (stf st)
@ -98,13 +117,13 @@ setCache file content = changeState $ \s -> s
| length l < logFilesToCache = (file, content) : l | length l < logFilesToCache = (file, content) : l
| otherwise = (file, content) : Prelude.init l | otherwise = (file, content) : Prelude.init l
getCache :: RawFilePath -> Annex (Maybe L.ByteString) getCache :: RawFilePath -> BranchState -> Maybe L.ByteString
getCache file = (\st -> go (cachedFileContents st) st) <$> getState getCache file state = go (cachedFileContents state)
where where
go [] _ = Nothing go [] = Nothing
go ((f,c):rest) state go ((f,c):rest)
| f == file && not (needInteractiveAccess state) = Just c | f == file && not (needInteractiveAccess state) = Just c
| otherwise = go rest state | otherwise = go rest
invalidateCache :: Annex () invalidateCache :: Annex ()
invalidateCache = changeState $ \s -> s { cachedFileContents = [] } invalidateCache = changeState $ \s -> s { cachedFileContents = [] }

View file

@ -225,8 +225,9 @@ manualPull currentbranch remotes = do
, return $ Just r , return $ Just r
) )
else return Nothing else return Nothing
haddiverged <- Annex.Branch.refsWereMerged haddiverged <- liftAnnex Annex.Branch.forceUpdate >>= return . \case
<$> liftAnnex Annex.Branch.forceUpdate u@(Annex.Branch.UpdateMade {}) -> Annex.Branch.refsWereMerged u
(Annex.Branch.UpdateFailedPermissions {}) -> True
forM_ remotes $ \r -> forM_ remotes $ \r ->
liftAnnex $ Command.Sync.mergeRemote r liftAnnex $ Command.Sync.mergeRemote r
currentbranch mc def currentbranch mc def

View file

@ -70,8 +70,9 @@ onChange file
| ".lock" `isSuffixOf` file = noop | ".lock" `isSuffixOf` file = noop
| isAnnexBranch file = do | isAnnexBranch file = do
branchChanged branchChanged
diverged <- Annex.Branch.refsWereMerged diverged <- liftAnnex Annex.Branch.forceUpdate >>= return . \case
<$> liftAnnex Annex.Branch.forceUpdate u@(Annex.Branch.UpdateMade {}) -> Annex.Branch.refsWereMerged u
(Annex.Branch.UpdateFailedPermissions {}) -> True
when diverged $ do when diverged $ do
updateExportTreeFromLogAll updateExportTreeFromLogAll
queueDeferredDownloads "retrying deferred download" Later queueDeferredDownloads "retrying deferred download" Later

View file

@ -17,6 +17,10 @@ git-annex (8.20211124) UNRELEASED; urgency=medium
* sync: Better error message when unable to export to a remote because * sync: Better error message when unable to export to a remote because
remote.name.annex-tracking-branch is configured to a ref that does not remote.name.annex-tracking-branch is configured to a ref that does not
exist. exist.
* Improved support for using git-annex in a read-only repository:
git-annex branch information from remotes that cannot be merged into
the git-annex branch will no longer crash it, but will instead be
merged in memory.
-- Joey Hess <id@joeyh.name> Tue, 23 Nov 2021 15:58:27 -0400 -- Joey Hess <id@joeyh.name> Tue, 23 Nov 2021 15:58:27 -0400

View file

@ -1,6 +1,6 @@
{- git-annex BranchState data type {- git-annex BranchState data type
- -
- Copyright 2011-2020 Joey Hess <id@joeyh.name> - Copyright 2011-2021 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -8,17 +8,23 @@
module Types.BranchState where module Types.BranchState where
import Common import Common
import qualified Git
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
data BranchState = BranchState data BranchState = BranchState
{ branchUpdated :: Bool { branchUpdated :: Bool
-- ^ has the branch been updated this run? -- ^ has the branch been updated this run? (Or an update tried and
-- failed due to permissions.)
, indexChecked :: Bool , indexChecked :: Bool
-- ^ has the index file been checked to exist? -- ^ has the index file been checked to exist?
, journalIgnorable :: Bool , journalIgnorable :: Bool
-- ^ can reading the journal be skipped, while still getting -- ^ can reading the journal be skipped, while still getting
-- sufficiently up-to-date information from the branch? -- sufficiently up-to-date information from the branch?
, unmergedRefs :: [Git.Sha]
-- ^ when the branch was not able to be updated due to permissions,
-- these other git refs contain unmerged information and need to be
-- queried, along with the index and the journal.
, cachedFileContents :: [(RawFilePath, L.ByteString)] , cachedFileContents :: [(RawFilePath, L.ByteString)]
-- ^ contents of a few files recently read from the branch -- ^ contents of a few files recently read from the branch
, needInteractiveAccess :: Bool , needInteractiveAccess :: Bool
@ -29,4 +35,4 @@ data BranchState = BranchState
} }
startBranchState :: BranchState startBranchState :: BranchState
startBranchState = BranchState False False False [] False startBranchState = BranchState False False False [] [] False