git-annex/Logs.hs
Joey Hess 9483b10469
cache one more log file for metadata
My worry was that a preferred content expression that matches on metadata
would have removed the location log from cache, causing an expensive
re-read when a Seek action later checked the location log.

Especially when the --all optimisation in the previous commit
pre-cached the location log.

This also means that the --all optimisation could cache the metadata log
too, if it wanted too, but not currently done.

The cache is a list, with the most recently accessed file first. That
optimises it for the common case of reading the same file twice, eg a
get, examine, followed by set reads it twice. And sync --content reads the
location log 3 times in a row commonly.

But, as a list, it should not be made to be too long. I thought about
expanding it to 5 items, but that seemed unlikely to be a win commonly
enough to outweigh the extra time spent checking the cache.

Clearly there could be some further benchmarking and tuning here.
2020-07-07 14:18:55 -04:00

253 lines
7.4 KiB
Haskell

{- git-annex log file names
-
- Copyright 2013-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Logs where
import Annex.Common
import Annex.DirHashes
import qualified Data.ByteString as S
import qualified System.FilePath.ByteString as P
{- There are several varieties of log file formats. -}
data LogVariety
= OldUUIDBasedLog
| NewUUIDBasedLog
| ChunkLog Key
| PresenceLog Key
| RemoteMetaDataLog
| OtherLog
deriving (Show)
{- Converts a path from the git-annex branch into one of the varieties
- of logs used by git-annex, if it's a known path. -}
getLogVariety :: GitConfig -> RawFilePath -> Maybe LogVariety
getLogVariety config f
| f `elem` topLevelOldUUIDBasedLogs = Just OldUUIDBasedLog
| f `elem` topLevelNewUUIDBasedLogs = Just NewUUIDBasedLog
| isRemoteStateLog f = Just NewUUIDBasedLog
| isRemoteContentIdentifierLog f = Just NewUUIDBasedLog
| isChunkLog f = ChunkLog <$> extLogFileKey chunkLogExt f
| isRemoteMetaDataLog f = Just RemoteMetaDataLog
| isMetaDataLog f || f `elem` otherLogs = Just OtherLog
| otherwise = PresenceLog <$> firstJust (presenceLogs config f)
{- Typical number of log files that may be read while processing a single
- key. This is used to size a cache.
-
- The location log is generally read, and the metadata log is read when
- matching a preferred content expression that matches on metadata,
- or when using metadata options.
-
- When using a remote, the url log, chunk log, remote state log, remote
- metadata log, and remote content identifier log might each be used,
- but probably at most 3 out of the 6. However, caching too much slows
- down all operations because the cache is a linear list, so the cache
- is not currently sized to include these.
-
- The result is that when seeking for files to operate on,
- the location log will stay in the cache if the metadata log is also
- read.
-}
logFilesToCache :: Int
logFilesToCache = 2
{- All the old-format uuid-based logs stored in the top of the git-annex branch. -}
topLevelOldUUIDBasedLogs :: [RawFilePath]
topLevelOldUUIDBasedLogs =
[ uuidLog
, remoteLog
, trustLog
, groupLog
, preferredContentLog
, requiredContentLog
, scheduleLog
, activityLog
, differenceLog
, multicastLog
]
{- All the new-format uuid-based logs stored in the top of the git-annex branch. -}
topLevelNewUUIDBasedLogs :: [RawFilePath]
topLevelNewUUIDBasedLogs =
[ exportLog
]
{- All the ways to get a key from a presence log file -}
presenceLogs :: GitConfig -> RawFilePath -> [Maybe Key]
presenceLogs config f =
[ urlLogFileKey f
, locationLogFileKey config f
]
{- Top-level logs that are neither UUID based nor presence logs. -}
otherLogs :: [RawFilePath]
otherLogs =
[ numcopiesLog
, groupPreferredContentLog
]
uuidLog :: RawFilePath
uuidLog = "uuid.log"
numcopiesLog :: RawFilePath
numcopiesLog = "numcopies.log"
configLog :: RawFilePath
configLog = "config.log"
remoteLog :: RawFilePath
remoteLog = "remote.log"
trustLog :: RawFilePath
trustLog = "trust.log"
groupLog :: RawFilePath
groupLog = "group.log"
preferredContentLog :: RawFilePath
preferredContentLog = "preferred-content.log"
requiredContentLog :: RawFilePath
requiredContentLog = "required-content.log"
groupPreferredContentLog :: RawFilePath
groupPreferredContentLog = "group-preferred-content.log"
scheduleLog :: RawFilePath
scheduleLog = "schedule.log"
activityLog :: RawFilePath
activityLog = "activity.log"
differenceLog :: RawFilePath
differenceLog = "difference.log"
multicastLog :: RawFilePath
multicastLog = "multicast.log"
exportLog :: RawFilePath
exportLog = "export.log"
{- The pathname of the location log file for a given key. -}
locationLogFile :: GitConfig -> Key -> RawFilePath
locationLogFile config key =
branchHashDir config key P.</> keyFile key <> ".log"
{- The filename of the url log for a given key. -}
urlLogFile :: GitConfig -> Key -> RawFilePath
urlLogFile config key =
branchHashDir config key P.</> keyFile key <> urlLogExt
{- Old versions stored the urls elsewhere. -}
oldurlLogs :: GitConfig -> Key -> [RawFilePath]
oldurlLogs config key =
[ "remote/web" P.</> hdir P.</> serializeKey' key <> ".log"
, "remote/web" P.</> hdir P.</> keyFile key <> ".log"
]
where
hdir = branchHashDir config key
urlLogExt :: S.ByteString
urlLogExt = ".log.web"
{- Does not work on oldurllogs. -}
isUrlLog :: RawFilePath -> Bool
isUrlLog file = urlLogExt `S.isSuffixOf` file
{- The filename of the remote state log for a given key. -}
remoteStateLogFile :: GitConfig -> Key -> RawFilePath
remoteStateLogFile config key =
(branchHashDir config key P.</> keyFile key)
<> remoteStateLogExt
remoteStateLogExt :: S.ByteString
remoteStateLogExt = ".log.rmt"
isRemoteStateLog :: RawFilePath -> Bool
isRemoteStateLog path = remoteStateLogExt `S.isSuffixOf` path
{- The filename of the chunk log for a given key. -}
chunkLogFile :: GitConfig -> Key -> RawFilePath
chunkLogFile config key =
(branchHashDir config key P.</> keyFile key)
<> chunkLogExt
chunkLogExt :: S.ByteString
chunkLogExt = ".log.cnk"
isChunkLog :: RawFilePath -> Bool
isChunkLog path = chunkLogExt `S.isSuffixOf` path
{- The filename of the metadata log for a given key. -}
metaDataLogFile :: GitConfig -> Key -> RawFilePath
metaDataLogFile config key =
(branchHashDir config key P.</> keyFile key)
<> metaDataLogExt
metaDataLogExt :: S.ByteString
metaDataLogExt = ".log.met"
isMetaDataLog :: RawFilePath -> Bool
isMetaDataLog path = metaDataLogExt `S.isSuffixOf` path
{- The filename of the remote metadata log for a given key. -}
remoteMetaDataLogFile :: GitConfig -> Key -> RawFilePath
remoteMetaDataLogFile config key =
(branchHashDir config key P.</> keyFile key)
<> remoteMetaDataLogExt
remoteMetaDataLogExt :: S.ByteString
remoteMetaDataLogExt = ".log.rmet"
isRemoteMetaDataLog :: RawFilePath -> Bool
isRemoteMetaDataLog path = remoteMetaDataLogExt `S.isSuffixOf` path
{- The filename of the remote content identifier log for a given key. -}
remoteContentIdentifierLogFile :: GitConfig -> Key -> RawFilePath
remoteContentIdentifierLogFile config key =
(branchHashDir config key P.</> keyFile key)
<> remoteContentIdentifierExt
remoteContentIdentifierExt :: S.ByteString
remoteContentIdentifierExt = ".log.cid"
isRemoteContentIdentifierLog :: RawFilePath -> Bool
isRemoteContentIdentifierLog path = remoteContentIdentifierExt `S.isSuffixOf` path
{- From an extension and a log filename, get the key that it's a log for. -}
extLogFileKey :: S.ByteString -> RawFilePath -> Maybe Key
extLogFileKey expectedext path
| ext == expectedext = fileKey base
| otherwise = Nothing
where
file = P.takeFileName path
(base, ext) = S.splitAt (S.length file - extlen) file
extlen = S.length expectedext
{- Converts a url log file into a key.
- (Does not work on oldurlLogs.) -}
urlLogFileKey :: RawFilePath -> Maybe Key
urlLogFileKey = extLogFileKey urlLogExt
{- Converts a pathname into a key if it's a location log. -}
locationLogFileKey :: GitConfig -> RawFilePath -> Maybe Key
locationLogFileKey config path
| length (splitDirectories (fromRawFilePath path)) /= locationLogFileDepth config = Nothing
| otherwise = extLogFileKey ".log" path
{- Depth of location log files within the git-annex branch.
-
- Normally they are xx/yy/key.log so depth 3.
- The same extension is also used for other logs that
- are not location logs. -}
locationLogFileDepth :: GitConfig -> Int
locationLogFileDepth config = hashlevels + 1
where
HashLevels hashlevels = branchHashLevels config