diff --git a/Annex/Branch.hs b/Annex/Branch.hs index af1d55df35..e8b4d77694 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -31,6 +31,8 @@ module Annex.Branch ( rememberTreeish, performTransitions, withIndex, + precache, + overBranchFileContents, ) where import qualified Data.ByteString as B @@ -42,6 +44,7 @@ import Data.Function import Data.Char import Data.ByteString.Builder import Control.Concurrent (threadDelay) +import Control.Concurrent.MVar import qualified System.FilePath.ByteString as P import Annex.Common @@ -66,6 +69,7 @@ import Annex.HashObject import Git.Types (Ref(..), fromRef, fromRef', RefDate, TreeItemType(..)) import Git.FilePath import Annex.CatFile +import Git.CatFile (catObjectStreamLsTree) import Annex.Perms import Logs import Logs.Transitions @@ -261,6 +265,18 @@ get file = getCache file >>= \case setCache file content return content +{- Used to cache the value of a file, which has been read from the branch + - using some optimised method. The journal has to be checked, in case + - it has a newer version of the file that has not reached the branch yet. + -} +precache :: RawFilePath -> L.ByteString -> Annex () +precache file branchcontent = do + st <- getState + content <- if journalIgnorable st + then pure branchcontent + else fromMaybe branchcontent <$> getJournalFileStale file + Annex.BranchState.setCache file content + {- Like get, but does not merge the branch, so the info returned may not - reflect changes in remotes. - (Changing the value this returns, and then merging is always the @@ -758,3 +774,59 @@ rememberTreeishLocked treeish graftpoint jl = do -- say that the index contains c'. setIndexSha c' +{- Runs an action on the content of selected files from the branch. + - This is much faster than reading the content of each file in turn, + - because it lets git cat-file stream content without blocking. + - + - The action is passed a callback that it can repeatedly call to read + - the next file and its contents. When there are no more files, the + - callback will return Nothing. + -} +overBranchFileContents + :: (RawFilePath -> Maybe v) + -> (Annex (Maybe (v, RawFilePath, Maybe L.ByteString)) -> Annex ()) + -> Annex () +overBranchFileContents select go = do + st <- update + g <- Annex.gitRepo + (l, cleanup) <- inRepo $ Git.LsTree.lsTree + Git.LsTree.LsTreeRecursive + (Git.LsTree.LsTreeLong False) + fullname + let select' f = fmap (\v -> (v, f)) (select f) + buf <- liftIO newEmptyMVar + let go' reader = go $ liftIO reader >>= \case + Just ((v, f), content) -> do + -- Check the journal if it did not get + -- committed to the branch + content' <- if journalIgnorable st + then pure content + else maybe content Just <$> getJournalFileStale f + return (Just (v, f, content')) + Nothing + | journalIgnorable st -> return Nothing + -- The journal did not get committed to the + -- branch, and may contain files that + -- are not present in the branch, which + -- need to be provided to the action still. + -- This can cause the action to be run a + -- second time with a file it already ran on. + | otherwise -> liftIO (tryTakeMVar buf) >>= \case + Nothing -> drain buf =<< getJournalledFilesStale + Just fs -> drain buf fs + catObjectStreamLsTree l (select' . getTopFilePath . Git.LsTree.file) g go' + liftIO $ void cleanup + where + getnext [] = Nothing + getnext (f:fs) = case select f of + Nothing -> getnext fs + Just v -> Just (v, f, fs) + + drain buf fs = case getnext fs of + Just (v, f, fs') -> do + liftIO $ putMVar buf fs' + content <- getJournalFileStale f + return (Just (v, f, content)) + Nothing -> do + liftIO $ putMVar buf [] + return Nothing diff --git a/CHANGELOG b/CHANGELOG index cdbf2c004f..a9793f09d0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -12,6 +12,9 @@ git-annex (8.20210331) UNRELEASED; urgency=medium * directory: When cp supports reflinks, use it. * init: Fix a crash when the repo's was cloned from a repo that had an adjusted branch checked out, and the origin remote is not named "origin". + * Fix bug caused by recent optimisations that could make git-annex not + see recently recorded status information when configured with + annex.alwayscommit=false. -- Joey Hess Thu, 01 Apr 2021 12:17:26 -0400 diff --git a/CmdLine/Seek.hs b/CmdLine/Seek.hs index 65156772e3..096c588485 100644 --- a/CmdLine/Seek.hs +++ b/CmdLine/Seek.hs @@ -9,8 +9,6 @@ - Licensed under the GNU AGPL version 3 or higher. -} -{-# LANGUAGE TupleSections #-} - module CmdLine.Seek where import Annex.Common @@ -44,7 +42,6 @@ import Annex.Concurrent import Annex.CheckIgnore import Annex.Action import qualified Annex.Branch -import qualified Annex.BranchState import qualified Database.Keys import qualified Utility.RawFilePath as R import Utility.Tuple @@ -274,25 +271,18 @@ withKeyOptions' ko auto mkkeyaction fallbackaction worktreeitems = do checktimelimit <- mkCheckTimeLimit keyaction <- mkkeyaction config <- Annex.getGitConfig - g <- Annex.gitRepo - void Annex.Branch.update - (l, cleanup) <- inRepo $ LsTree.lsTree - LsTree.LsTreeRecursive - (LsTree.LsTreeLong False) - Annex.Branch.fullname - let getk f = fmap (,f) (locationLogFileKey config f) + let getk = locationLogFileKey config let discard reader = reader >>= \case Nothing -> noop Just _ -> discard reader - let go reader = liftIO reader >>= \case - Nothing -> return () - Just ((k, f), content) -> checktimelimit (discard reader) $ do - maybe noop (Annex.BranchState.setCache f) content + let go reader = reader >>= \case + Just (k, f, content) -> checktimelimit (discard reader) $ do + maybe noop (Annex.Branch.precache f) content keyaction Nothing (SeekInput [], k, mkActionItem k) go reader - catObjectStreamLsTree l (getk . getTopFilePath . LsTree.file) g go - liftIO $ void cleanup + Nothing -> return () + Annex.Branch.overBranchFileContents getk go runkeyaction getks = do keyaction <- mkkeyaction @@ -383,7 +373,7 @@ seekFilteredKeys seeker listfs = do liftIO $ void cleanup where finisher mi oreader checktimelimit = liftIO oreader >>= \case - Just ((si, f), content) -> checktimelimit discard $ do + Just ((si, f), content) -> checktimelimit (liftIO discard) $ do keyaction f mi content $ commandAction . startAction seeker si f finisher mi oreader checktimelimit @@ -394,8 +384,8 @@ seekFilteredKeys seeker listfs = do Just _ -> discard precachefinisher mi lreader checktimelimit = liftIO lreader >>= \case - Just ((logf, (si, f), k), logcontent) -> checktimelimit discard $ do - maybe noop (Annex.BranchState.setCache logf) logcontent + Just ((logf, (si, f), k), logcontent) -> checktimelimit (liftIO discard) $ do + maybe noop (Annex.Branch.precache logf) logcontent checkMatcherWhen mi (matcherNeedsLocationLog mi && not (matcherNeedsFileName mi)) (MatchingFile $ FileInfo f f (Just k)) @@ -601,9 +591,9 @@ notSymlink :: RawFilePath -> IO Bool notSymlink f = liftIO $ not . isSymbolicLink <$> R.getSymbolicLinkStatus f {- Returns an action that, when there's a time limit, can be used - - to check it before processing a file. The IO action is run when over the - - time limit. -} -mkCheckTimeLimit :: Annex (IO () -> Annex () -> Annex ()) + - to check it before processing a file. The first action is run when over the + - time limit, otherwise the second action is run. -} +mkCheckTimeLimit :: Annex (Annex () -> Annex () -> Annex ()) mkCheckTimeLimit = Annex.getState Annex.timelimit >>= \case Nothing -> return $ \_ a -> a Just (duration, cutoff) -> return $ \cleanup a -> do @@ -612,6 +602,6 @@ mkCheckTimeLimit = Annex.getState Annex.timelimit >>= \case then do warning $ "Time limit (" ++ fromDuration duration ++ ") reached! Shutting down..." shutdown True - liftIO cleanup + cleanup liftIO $ exitWith $ ExitFailure 101 else a diff --git a/doc/bugs/git-annex_branch_caching_bug.mdwn b/doc/bugs/git-annex_branch_caching_bug.mdwn new file mode 100644 index 0000000000..c1d7adaefd --- /dev/null +++ b/doc/bugs/git-annex_branch_caching_bug.mdwn @@ -0,0 +1,22 @@ +If the journal contains a newer version of a log file than the git-annex +branch, and annex.alwayscommit=false so the branch is not getting updated, +the value from the journal can be ignored when reading that log file. + +In CmdLine.Seek, there is some code that precaches location logs as an +optimisation (when using eg --copies). That streams info from the +git-annex branch into the cache. But it never checks for a journal file +with newer information. + +> fixed this + +Also in Cmdline.Seek, there is a LsTreeRecursive over the branch to handle +`--all`, and I think again that would mean it doesn't notice location +logs that are only in the journal. +Before that optimisation, it was using Logs.Location.loggedKeys, +which does look at the journal. + +> fixed + +(This is also a blocker for [[todo/hiding_a_repository]].) + +[[done]] --[[Joey]] diff --git a/doc/todo/hiding_a_repository.mdwn b/doc/todo/hiding_a_repository.mdwn index ab7e2560ad..b606d59be1 100644 --- a/doc/todo/hiding_a_repository.mdwn +++ b/doc/todo/hiding_a_repository.mdwn @@ -151,19 +151,9 @@ later write. > No way to configure what repo is hidden yet. --[[Joey]] > > Implementation notes: -> -> * CmdLine.Seek precaches git-annex branch -> location logs, but that does not include private ones. Since they're -> cached, the private ones don't get read. Result is eg, whereis finds no -> copies. Either need to disable CmdLine.Seek precaching when there's -> hidden repos, or could make the cache indicate it's only of public -> info, so private info still gets read. -> * CmdLine.Seek contains a LsTreeRecursive over the branch to handle -> --all, and again that won't see private information, including even -> annexed files that are only present in the hidden repo. -> * (And I wonder, don't both the caches above already miss things in -> the journal?) -> * Any other direct accesses of the branch, not going through +> +> * [[bugs/git-annex_branch_caching_bug]] was a problem, now fixed. +> * Any other similar direct accesses of the branch, not going through > Annex.Branch, also need to be fixed (and may be missing journal files > already?) Command.ImportFeed.knownItems is one. Command.Log behavior > needs to be investigated, may be ok. And Logs.Web.withKnownUrls is another.