From 9483b10469b075f498b9c6e13a976640799e8d76 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 7 Jul 2020 14:18:55 -0400 Subject: [PATCH] cache one more log file for metadata My worry was that a preferred content expression that matches on metadata would have removed the location log from cache, causing an expensive re-read when a Seek action later checked the location log. Especially when the --all optimisation in the previous commit pre-cached the location log. This also means that the --all optimisation could cache the metadata log too, if it wanted to, but that is not currently done. The cache is a list, with the most recently accessed file first. That optimises it for the common case of reading the same file twice, e.g. a get, examine, followed by a set reads it twice. And sync --content commonly reads the location log 3 times in a row. But, as a list, it should not be made to be too long. I thought about expanding it to 5 items, but that seemed unlikely to be a win commonly enough to outweigh the extra time spent checking the cache. Clearly there could be some further benchmarking and tuning here. 
--- Annex/BranchState.hs | 26 ++++++++++++-------------- Logs.hs | 21 ++++++++++++++++++++- Types/BranchState.hs | 8 +++----- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/Annex/BranchState.hs b/Annex/BranchState.hs index 38883ff86b..34f142a50c 100644 --- a/Annex/BranchState.hs +++ b/Annex/BranchState.hs @@ -12,6 +12,7 @@ module Annex.BranchState where import Annex.Common import Types.BranchState import qualified Annex +import Logs import qualified Data.ByteString.Lazy as L @@ -88,22 +89,19 @@ enableInteractiveBranchAccess = changeState $ setCache :: RawFilePath -> L.ByteString -> Annex () setCache file content = changeState $ \s -> s - { cachedFile = Just file - , cachedContent = content - } + { cachedFileContents = add (cachedFileContents s) } + where + add l + | length l < logFilesToCache = (file, content) : l + | otherwise = (file, content) : Prelude.init l getCache :: RawFilePath -> Annex (Maybe L.ByteString) -getCache file = go <$> getState +getCache file = (\st -> go (cachedFileContents st) st) <$> getState where - go state - | cachedFile state == Just file - && not (needInteractiveAccess state) = - Just (cachedContent state) - | otherwise = Nothing + go [] _ = Nothing + go ((f,c):rest) state + | f == file && not (needInteractiveAccess state) = Just c + | otherwise = go rest state invalidateCache :: Annex () -invalidateCache = changeState $ \s -> s - { cachedFile = Nothing - , cachedContent = mempty - } - +invalidateCache = changeState $ \s -> s { cachedFileContents = [] } diff --git a/Logs.hs b/Logs.hs index a475d59394..c7cec22272 100644 --- a/Logs.hs +++ b/Logs.hs @@ -38,6 +38,26 @@ getLogVariety config f | isMetaDataLog f || f `elem` otherLogs = Just OtherLog | otherwise = PresenceLog <$> firstJust (presenceLogs config f) +{- Typical number of log files that may be read while processing a single + - key. This is used to size a cache. 
+ - + - The location log is generally read, and the metadata log is read when + - matching a preferred content expression that matches on metadata, + - or when using metadata options. + - + - When using a remote, the url log, chunk log, remote state log, remote + - metadata log, and remote content identifier log might each be used, + - but probably at most 3 out of the 6. However, caching too much slows + - down all operations because the cache is a linear list, so the cache + - is not currently sized to include these. + - + - The result is that when seeking for files to operate on, + - the location log will stay in the cache if the metadata log is also + - read. + -} +logFilesToCache :: Int +logFilesToCache = 2 + {- All the old-format uuid-based logs stored in the top of the git-annex branch. -} topLevelOldUUIDBasedLogs :: [RawFilePath] topLevelOldUUIDBasedLogs = @@ -59,7 +79,6 @@ topLevelNewUUIDBasedLogs = [ exportLog ] - {- All the ways to get a key from a presence log file -} presenceLogs :: GitConfig -> RawFilePath -> [Maybe Key] presenceLogs config f = diff --git a/Types/BranchState.hs b/Types/BranchState.hs index 86055cf583..93f8a2afc2 100644 --- a/Types/BranchState.hs +++ b/Types/BranchState.hs @@ -19,10 +19,8 @@ data BranchState = BranchState , journalIgnorable :: Bool -- ^ can reading the journal be skipped, while still getting -- sufficiently up-to-date information from the branch? - , cachedFile :: Maybe RawFilePath - -- ^ a file recently read from the branch - , cachedContent :: L.ByteString - -- ^ content of the cachedFile + , cachedFileContents :: [(RawFilePath, L.ByteString)] + -- ^ contents of a few files recently read from the branch , needInteractiveAccess :: Bool -- ^ do new changes written to the journal or branch by another -- process need to be noticed while the current process is running? 
@@ -31,4 +29,4 @@ data BranchState = BranchState } startBranchState :: BranchState -startBranchState = BranchState False False False Nothing mempty False +startBranchState = BranchState False False False [] False