add back git-annex branch read cache

The cache was removed way back in 2012,
commit 3417c55189

Then I forgot I had removed it! I remember clearly multiple times when I
thought, "this reads the same data twice, but the cache will avoid that
being very expensive".

The reason it was removed was it messed up the assistant noticing when
other processes made changes. That same kind of problem has recently
been addressed when adding the optimisation to avoid reading the journal
unnecessarily.

Indeed, enableInteractiveJournalAccess is run in just the
right places, so can just piggyback on it to know when it's not safe
to use the cache.
This commit is contained in:
Joey Hess 2020-07-06 12:09:53 -04:00
parent 9a2fbc2ea8
commit e72ec8b9b2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
8 changed files with 68 additions and 19 deletions

View file

@ -230,6 +230,7 @@ updateTo' pairs = do
else commitIndex jl branchref merge_desc commitrefs else commitIndex jl branchref merge_desc commitrefs
) )
addMergedRefs tomerge addMergedRefs tomerge
invalidateCache
stagejournalwhen dirty jl a stagejournalwhen dirty jl a
| dirty = stageJournal jl a | dirty = stageJournal jl a
| otherwise = withIndex a | otherwise = withIndex a
@ -242,11 +243,15 @@ updateTo' pairs = do
- -
- Returns an empty string if the file doesn't exist yet. -} - Returns an empty string if the file doesn't exist yet. -}
get :: RawFilePath -> Annex L.ByteString get :: RawFilePath -> Annex L.ByteString
get file = do get file = getCache file >>= \case
Just content -> return content
Nothing -> do
st <- update st <- update
if journalIgnorable st content <- if journalIgnorable st
then getRef fullname file then getRef fullname file
else getLocal file else getLocal file
setCache file content
return content
{- Like get, but does not merge the branch, so the info returned may not {- Like get, but does not merge the branch, so the info returned may not
- reflect changes in remotes. - reflect changes in remotes.
@ -301,6 +306,11 @@ set :: Journalable content => JournalLocked -> RawFilePath -> content -> Annex (
set jl f c = do set jl f c = do
journalChanged journalChanged
setJournalFile jl f c setJournalFile jl f c
-- Could cache the new content, but it would involve
-- evaluating a Journalable Builder twice, which is not very
-- efficient. Instead, assume that it's not common to need to read
-- a log file immediately after writing it.
invalidateCache
{- Commit message used when making a commit of whatever data has changed {- Commit message used when making a commit of whatever data has changed
- to the git-annex brach. -} - to the git-annex brach. -}

View file

@ -1,6 +1,6 @@
{- git-annex branch state management {- git-annex branch state management
- -
- Runtime state about the git-annex branch. - Runtime state about the git-annex branch, and a small cache.
- -
- Copyright 2011-2020 Joey Hess <id@joeyh.name> - Copyright 2011-2020 Joey Hess <id@joeyh.name>
- -
@ -13,6 +13,8 @@ import Annex.Common
import Types.BranchState import Types.BranchState
import qualified Annex import qualified Annex
import qualified Data.ByteString.Lazy as L
getState :: Annex BranchState getState :: Annex BranchState
getState = Annex.getState Annex.branchstate getState = Annex.getState Annex.branchstate
@ -45,7 +47,7 @@ runUpdateOnce a = do
let stf = \st' -> st' let stf = \st' -> st'
{ branchUpdated = True { branchUpdated = True
, journalIgnorable = journalstaged , journalIgnorable = journalstaged
&& not (journalNeverIgnorable st') && not (needInteractiveAccess st')
} }
changeState stf changeState stf
return (stf st) return (stf st)
@ -76,7 +78,32 @@ journalChanged = do
- and needs to always notice changes made to the journal by other - and needs to always notice changes made to the journal by other
- processes, this disables optimisations that avoid normally reading the - processes, this disables optimisations that avoid normally reading the
- journal. - journal.
-
- It also avoids using the cache, so changes committed by other processes
- will be seen.
-} -}
enableInteractiveJournalAccess :: Annex () enableInteractiveBranchAccess :: Annex ()
enableInteractiveJournalAccess = changeState $ enableInteractiveBranchAccess = changeState $
\s -> s { journalNeverIgnorable = True } \s -> s { needInteractiveAccess = True }
setCache :: RawFilePath -> L.ByteString -> Annex ()
setCache file content = changeState $ \s -> s
{ cachedFile = Just file
, cachedContent = content
}
getCache :: RawFilePath -> Annex (Maybe L.ByteString)
getCache file = go <$> getState
where
go state
| cachedFile state == Just file
&& not (needInteractiveAccess state) =
Just (cachedContent state)
| otherwise = Nothing
invalidateCache :: Annex ()
invalidateCache = changeState $ \s -> s
{ cachedFile = Nothing
, cachedContent = mempty
}

View file

@ -72,7 +72,7 @@ stopDaemon = liftIO . Utility.Daemon.stopDaemon =<< fromRepo gitAnnexPidFile
startDaemon :: Bool -> Bool -> Maybe Duration -> Maybe String -> Maybe HostName -> Maybe (Maybe Handle -> Maybe Handle -> String -> FilePath -> IO ()) -> Annex () startDaemon :: Bool -> Bool -> Maybe Duration -> Maybe String -> Maybe HostName -> Maybe (Maybe Handle -> Maybe Handle -> String -> FilePath -> IO ()) -> Annex ()
startDaemon assistant foreground startdelay cannotrun listenhost startbrowser = do startDaemon assistant foreground startdelay cannotrun listenhost startbrowser = do
Annex.changeState $ \s -> s { Annex.daemon = True } Annex.changeState $ \s -> s { Annex.daemon = True }
enableInteractiveJournalAccess enableInteractiveBranchAccess
pidfile <- fromRepo gitAnnexPidFile pidfile <- fromRepo gitAnnexPidFile
logfile <- fromRepo gitAnnexLogFile logfile <- fromRepo gitAnnexLogFile
liftIO $ debugM desc $ "logging to " ++ logfile liftIO $ debugM desc $ "logging to " ++ logfile

View file

@ -23,6 +23,8 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
S3 (with versioning=yes), git-lfs, tahoe S3 (with versioning=yes), git-lfs, tahoe
* Fix reversion that broke passing annex.* and remote.*.annex-* * Fix reversion that broke passing annex.* and remote.*.annex-*
git configs with -c. (Since version 8.20200330.) git configs with -c. (Since version 8.20200330.)
* Bring back git-annex branch read cache. This speeds up some operations;
git-annex sync --content gets up to 3x faster.
-- Joey Hess <id@joeyh.name> Thu, 18 Jun 2020 12:21:14 -0400 -- Joey Hess <id@joeyh.name> Thu, 18 Jun 2020 12:21:14 -0400

View file

@ -80,7 +80,7 @@ batchInput fmt parser a = go =<< batchLines fmt
batchLines :: BatchFormat -> Annex [String] batchLines :: BatchFormat -> Annex [String]
batchLines fmt = do batchLines fmt = do
enableInteractiveJournalAccess enableInteractiveBranchAccess
liftIO $ splitter <$> getContents liftIO $ splitter <$> getContents
where where
splitter = case fmt of splitter = case fmt of

View file

@ -30,7 +30,7 @@ seek = withNothing (commandAction start)
start :: CommandStart start :: CommandStart
start = do start = do
enableInteractiveJournalAccess enableInteractiveBranchAccess
(readh, writeh) <- liftIO dupIoHandles (readh, writeh) <- liftIO dupIoHandles
runRequests readh writeh runner runRequests readh writeh runner
stop stop

View file

@ -166,7 +166,7 @@ genTransportHandle = do
let h = TransportHandle (LocalRepo g) annexstate let h = TransportHandle (LocalRepo g) annexstate
liftAnnex h $ do liftAnnex h $ do
Annex.setOutput QuietOutput Annex.setOutput QuietOutput
enableInteractiveJournalAccess enableInteractiveBranchAccess
return h return h
updateTransportHandle :: TransportHandle -> IO TransportHandle updateTransportHandle :: TransportHandle -> IO TransportHandle

View file

@ -1,12 +1,16 @@
{- git-annex BranchState data type {- git-annex BranchState data type
- -
- Copyright 2011 Joey Hess <id@joeyh.name> - Copyright 2011-2020 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
module Types.BranchState where module Types.BranchState where
import Common
import qualified Data.ByteString.Lazy as L
data BranchState = BranchState data BranchState = BranchState
{ branchUpdated :: Bool { branchUpdated :: Bool
-- ^ has the branch been updated this run? -- ^ has the branch been updated this run?
@ -15,10 +19,16 @@ data BranchState = BranchState
, journalIgnorable :: Bool , journalIgnorable :: Bool
-- ^ can reading the journal be skipped, while still getting -- ^ can reading the journal be skipped, while still getting
-- sufficiently up-to-date information from the branch? -- sufficiently up-to-date information from the branch?
, journalNeverIgnorable :: Bool , cachedFile :: Maybe RawFilePath
-- ^ should the journal always be read even if it would normally -- ^ a file recently read from the branch
-- be safe to skip it? , cachedContent :: L.ByteString
-- ^ content of the cachedFile
, needInteractiveAccess :: Bool
-- ^ do new changes written to the journal or branch by another
-- process need to be noticed while the current process is running?
-- (This makes the journal always be read, and avoids using the
-- cache.)
} }
startBranchState :: BranchState startBranchState :: BranchState
startBranchState = BranchState False False False False startBranchState = BranchState False False False Nothing mempty False