cache one more log file for metadata

My worry was that a preferred content expression that matches on metadata
would have removed the location log from cache, causing an expensive
re-read when a Seek action later checked the location log.

Especially when the --all optimisation in the previous commit
pre-cached the location log.

This also means that the --all optimisation could cache the metadata log
too, if it wanted to, but that is not currently done.

The cache is a list, with the most recently accessed file first. That
optimises it for the common case of reading the same file twice, e.g. a
get, examine, then set sequence reads it twice. And sync --content
commonly reads the location log 3 times in a row.

But, as a list, it should not be made to be too long. I thought about
expanding it to 5 items, but that seemed unlikely to be a win commonly
enough to outweigh the extra time spent checking the cache.

Clearly there could be some further benchmarking and tuning here.
This commit is contained in:
Joey Hess 2020-07-07 14:18:55 -04:00
parent d010ab04be
commit 9483b10469
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 35 additions and 20 deletions

View file

@ -12,6 +12,7 @@ module Annex.BranchState where
import Annex.Common
import Types.BranchState
import qualified Annex
import Logs
import qualified Data.ByteString.Lazy as L
@ -88,22 +89,19 @@ enableInteractiveBranchAccess = changeState $
{- Stores a file's content at the front of the cache. While the cache
- holds fewer than logFilesToCache entries it simply grows; once it is
- full, the last entry in the list is dropped to make room.
-
- NOTE(review): an existing entry for the same file is not removed
- before the new one is added, so one file can occupy more than one
- slot -- confirm whether that is intended.
-}
setCache :: RawFilePath -> L.ByteString -> Annex ()
setCache file content = changeState $ \s -> s
{ cachedFile = Just file
, cachedContent = content
}
{ cachedFileContents = add (cachedFileContents s) }
where
add l
| length l < logFilesToCache = (file, content) : l
| otherwise = (file, content) : Prelude.init l
{- Looks up a file's content in the cache, checking entries from the
- front of the list and returning the first one whose file matches.
-
- Nothing is returned when needInteractiveAccess is set, even if the
- file is cached.
-}
getCache :: RawFilePath -> Annex (Maybe L.ByteString)
getCache file = go <$> getState
getCache file = (\st -> go (cachedFileContents st) st) <$> getState
where
go state
| cachedFile state == Just file
&& not (needInteractiveAccess state) =
Just (cachedContent state)
| otherwise = Nothing
go [] _ = Nothing
go ((f,c):rest) state
| f == file && not (needInteractiveAccess state) = Just c
| otherwise = go rest state
{- Empties the cache, so that nothing previously cached is returned. -}
invalidateCache :: Annex ()
invalidateCache = changeState $ \s -> s
{ cachedFile = Nothing
, cachedContent = mempty
}
invalidateCache = changeState $ \s -> s { cachedFileContents = [] }

21
Logs.hs
View file

@ -38,6 +38,26 @@ getLogVariety config f
| isMetaDataLog f || f `elem` otherLogs = Just OtherLog
| otherwise = PresenceLog <$> firstJust (presenceLogs config f)
{- Typical number of log files that may be read while processing a single
- key. This is used to size a cache.
-
- The location log is generally read, and the metadata log is read when
- matching a preferred content expression that matches on metadata,
- or when using metadata options.
-
- When using a remote, the url log, chunk log, remote state log, remote
- metadata log, and remote content identifier log might each be used,
- but probably at most 3 of them. However, caching too much slows
- down all operations because the cache is a linear list, so the cache
- is not currently sized to include these.
-
- The result is that when seeking for files to operate on,
- the location log will stay in the cache if the metadata log is also
- read.
-
- The value of 2 covers just those two logs: the location log and the
- metadata log.
-}
logFilesToCache :: Int
logFilesToCache = 2
{- All the old-format uuid-based logs stored in the top of the git-annex branch. -}
topLevelOldUUIDBasedLogs :: [RawFilePath]
topLevelOldUUIDBasedLogs =
@ -59,7 +79,6 @@ topLevelNewUUIDBasedLogs =
[ exportLog
]
{- All the ways to get a key from a presence log file -}
presenceLogs :: GitConfig -> RawFilePath -> [Maybe Key]
presenceLogs config f =

View file

@ -19,10 +19,8 @@ data BranchState = BranchState
, journalIgnorable :: Bool
-- ^ can reading the journal be skipped, while still getting
-- sufficiently up-to-date information from the branch?
, cachedFile :: Maybe RawFilePath
-- ^ a file recently read from the branch
, cachedContent :: L.ByteString
-- ^ content of the cachedFile
, cachedFileContents :: [(RawFilePath, L.ByteString)]
-- ^ contents of a few files recently read from the branch
, needInteractiveAccess :: Bool
-- ^ do new changes written to the journal or branch by another
-- process need to be noticed while the current process is running?
@ -31,4 +29,4 @@ data BranchState = BranchState
}
startBranchState :: BranchState
startBranchState = BranchState False False False Nothing mempty False
startBranchState = BranchState False False False [] False