seek: defer matcher check until more info is known

Sped up seeking for files to operate on, when using options like --copies
or --in, by around 20%.

Benchmark showed an increase for --copies from 155 seconds to 121
seconds, and --in remote will be similar to that.

For --in here, the speedup was less, 5-10% or so.

(both warm cache)

This commit was sponsored by Jack Hill on Patreon.
This commit is contained in:
Joey Hess 2020-09-24 17:59:05 -04:00
parent c2d1d4e16e
commit ace02f41b0
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 149 additions and 66 deletions

View file

@ -269,5 +269,6 @@ call (Right sub) = Right $ Operation $ MatchFiles
matchMrun sub $ \o -> matchAction o notpresent mi matchMrun sub $ \o -> matchAction o notpresent mi
, matchNeedsFileName = any matchNeedsFileName sub , matchNeedsFileName = any matchNeedsFileName sub
, matchNeedsFileContent = any matchNeedsFileContent sub , matchNeedsFileContent = any matchNeedsFileContent sub
, matchNeedsKey = any matchNeedsKey sub
} }
call (Left err) = Left err call (Left err) = Left err

View file

@ -21,6 +21,8 @@ git-annex (8.20200909) UNRELEASED; urgency=medium
* sync --all: Sped up seeking to around twice as fast, by avoiding a * sync --all: Sped up seeking to around twice as fast, by avoiding a
pass over the worktree files when preferred content expressions of the pass over the worktree files when preferred content expressions of the
local repo and remotes don't use include=/exclude=. local repo and remotes don't use include=/exclude=.
* Sped up seeking for files to operate on, when using options like
--copies or --in, by around 20%
-- Joey Hess <id@joeyh.name> Mon, 14 Sep 2020 18:34:37 -0400 -- Joey Hess <id@joeyh.name> Mon, 14 Sep 2020 18:34:37 -0400

View file

@ -278,6 +278,18 @@ seekFiltered a fs = do
process matcher v@(_si, f) = process matcher v@(_si, f) =
whenM (matcher $ MatchingFile $ FileInfo f f) (a v) whenM (matcher $ MatchingFile $ FileInfo f f) (a v)
data MatcherInfo = MatcherInfo
{ matcherAction :: MatchInfo -> Annex Bool
, matcherNeedsFileName :: Bool
, matcherNeedsKey :: Bool
, matcherNeedsLocationLog :: Bool
}
checkMatcherWhen :: MatcherInfo -> Bool -> MatchInfo -> Annex () -> Annex ()
checkMatcherWhen mi c i a
| c = whenM (matcherAction mi i) a
| otherwise = a
-- This is significantly faster than using lookupKey after seekFiltered, -- This is significantly faster than using lookupKey after seekFiltered,
-- because of the way data is streamed through git cat-file. -- because of the way data is streamed through git cat-file.
-- --
@ -285,7 +297,11 @@ seekFiltered a fs = do
seekFilteredKeys :: AnnexedFileSeeker -> Annex [(SeekInput, (RawFilePath, Git.Sha, FileMode))] -> Annex () seekFilteredKeys :: AnnexedFileSeeker -> Annex [(SeekInput, (RawFilePath, Git.Sha, FileMode))] -> Annex ()
seekFilteredKeys seeker listfs = do seekFilteredKeys seeker listfs = do
g <- Annex.gitRepo g <- Annex.gitRepo
matcher <- Limit.getMatcher mi <- MatcherInfo
<$> Limit.getMatcher
<*> Limit.introspect matchNeedsFileName
<*> Limit.introspect matchNeedsKey
<*> Limit.introspect matchNeedsLocationLog
config <- Annex.getGitConfig config <- Annex.getGitConfig
-- Run here, not in the async, because it could throw an exception -- Run here, not in the async, because it could throw an exception
-- The list should be built lazily. -- The list should be built lazily.
@ -293,93 +309,104 @@ seekFilteredKeys seeker listfs = do
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader -> catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
catObjectStream g $ \ofeeder ocloser oreader -> do catObjectStream g $ \ofeeder ocloser oreader -> do
processertid <- liftIO . async =<< forkState processertid <- liftIO . async =<< forkState
(process matcher ofeeder mdfeeder mdcloser False l) (process mi ofeeder mdfeeder mdcloser False l)
mdprocessertid <- liftIO . async =<< forkState mdprocessertid <- liftIO . async =<< forkState
(mdprocess matcher mdreader ofeeder ocloser) (mdprocess mi mdreader ofeeder ocloser)
if usesLocationLog seeker if usesLocationLog seeker || matcherNeedsLocationLog mi
then catObjectStream g $ \lfeeder lcloser lreader -> do then catObjectStream g $ \lfeeder lcloser lreader -> do
precachertid <- liftIO . async =<< forkState precachertid <- liftIO . async =<< forkState
(precacher config oreader lfeeder lcloser) (precacher mi config oreader lfeeder lcloser)
precachefinisher lreader precachefinisher mi lreader
join (liftIO (wait precachertid)) join (liftIO (wait precachertid))
else finisher oreader else finisher mi oreader
join (liftIO (wait mdprocessertid)) join (liftIO (wait mdprocessertid))
join (liftIO (wait processertid)) join (liftIO (wait processertid))
where where
finisher mi oreader = liftIO oreader >>= \case
Just ((si, f), content) -> do
keyaction f mi content $
commandAction . startAction seeker si f
finisher mi oreader
Nothing -> return ()
precachefinisher mi lreader = liftIO lreader >>= \case
Just ((logf, (si, f), k), logcontent) -> do
maybe noop (Annex.BranchState.setCache logf) logcontent
checkMatcherWhen mi
(matcherNeedsLocationLog mi && not (matcherNeedsFileName mi))
(MatchingKey k (AssociatedFile (Just f)))
(commandAction $ startAction seeker si f k)
precachefinisher mi lreader
Nothing -> return ()
precacher mi config oreader lfeeder lcloser = liftIO oreader >>= \case
Just ((si, f), content) -> do
keyaction f mi content $ \k ->
let logf = locationLogFile config k
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ lfeeder ((logf, (si, f), k), ref)
precacher mi config oreader lfeeder lcloser
Nothing -> liftIO $ void lcloser
feedmatches mi ofeeder si f sha = checkMatcherWhen mi
-- When the matcher needs a key or location log
-- (and does not need a worktree filename), it will be
-- checked later, to avoid a slow lookup here.
(not ((matcherNeedsKey mi || matcherNeedsLocationLog mi)
&& not (matcherNeedsFileName mi)))
(MatchingFile $ FileInfo f f)
(liftIO $ ofeeder ((si, f), sha))
keyaction f mi content a =
case parseLinkTargetOrPointerLazy =<< content of
Just k -> checkMatcherWhen mi
(matcherNeedsKey mi && not (matcherNeedsFileName mi || matcherNeedsLocationLog mi))
(MatchingKey k (AssociatedFile (Just f)))
(checkpresence k (a k))
Nothing -> noop
checkpresence k cont = case checkContentPresent seeker of checkpresence k cont = case checkContentPresent seeker of
Just v -> do Just v -> do
present <- inAnnex k present <- inAnnex k
when (present == v) cont when (present == v) cont
Nothing -> cont Nothing -> cont
finisher oreader = liftIO oreader >>= \case process mi ofeeder mdfeeder mdcloser seenpointer ((si, (f, sha, mode)):rest) =
Just ((si, f), content) -> do
case parseLinkTargetOrPointerLazy =<< content of
Just k -> checkpresence k $
commandAction $
startAction seeker si f k
Nothing -> noop
finisher oreader
Nothing -> return ()
precachefinisher lreader = liftIO lreader >>= \case
Just ((logf, (si, f), k), logcontent) -> do
maybe noop (Annex.BranchState.setCache logf) logcontent
commandAction $ startAction seeker si f k
precachefinisher lreader
Nothing -> return ()
precacher config oreader lfeeder lcloser = liftIO oreader >>= \case
Just ((si, f), content) -> do
case parseLinkTargetOrPointerLazy =<< content of
Just k -> checkpresence k $
let logf = locationLogFile config k
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ lfeeder ((logf, (si, f), k), ref)
Nothing -> noop
precacher config oreader lfeeder lcloser
Nothing -> liftIO $ void lcloser
feedmatches matcher ofeeder si f sha =
whenM (matcher $ MatchingFile $ FileInfo f f) $
liftIO $ ofeeder ((si, f), sha)
process matcher ofeeder mdfeeder mdcloser seenpointer ((si, (f, sha, mode)):rest) =
case Git.toTreeItemType mode of case Git.toTreeItemType mode of
Just Git.TreeSymlink -> do Just Git.TreeSymlink -> do
whenM (exists f) $ whenM (exists f) $
-- Once a pointer file has been seen, -- Once a pointer file has been seen,
-- symlinks have to be sent via the -- symlinks have to be sent via the
-- metadata processor too. That is slightly -- metadata processor too. That is
-- slower, but preserves the requested -- slightly slower, but preserves the
-- file order. -- requested file order.
if seenpointer if seenpointer
then liftIO $ mdfeeder ((si, f), sha) then liftIO $ mdfeeder ((si, f), sha)
else feedmatches matcher ofeeder si f sha else feedmatches mi ofeeder si f sha
process matcher ofeeder mdfeeder mdcloser seenpointer rest process mi ofeeder mdfeeder mdcloser seenpointer rest
Just Git.TreeSubmodule -> Just Git.TreeSubmodule ->
process matcher ofeeder mdfeeder mdcloser seenpointer rest process mi ofeeder mdfeeder mdcloser seenpointer rest
-- Might be a pointer file, might be other -- Might be a pointer file, might be other
-- file in git, possibly large. Avoid catting -- file in git, possibly large. Avoid catting
-- large files by first looking up the size. -- large files by first looking up the size.
Just _ -> do Just _ -> do
whenM (exists f) $ whenM (exists f) $
liftIO $ mdfeeder ((si, f), sha) liftIO $ mdfeeder ((si, f), sha)
process matcher ofeeder mdfeeder mdcloser True rest process mi ofeeder mdfeeder mdcloser True rest
Nothing -> Nothing ->
process matcher ofeeder mdfeeder mdcloser seenpointer rest process mi ofeeder mdfeeder mdcloser seenpointer rest
process _ _ _ mdcloser _ [] = liftIO $ void mdcloser process _ _ _ mdcloser _ [] = liftIO $ void mdcloser
-- Check if files exist, because a deleted file will still be -- Check if files exist, because a deleted file will still be
-- listed by ls-tree, but should not be processed. -- listed by ls-tree, but should not be processed.
exists p = isJust <$> liftIO (catchMaybeIO $ R.getSymbolicLinkStatus p) exists p = isJust <$> liftIO (catchMaybeIO $ R.getSymbolicLinkStatus p)
mdprocess matcher mdreader ofeeder ocloser = liftIO mdreader >>= \case mdprocess mi mdreader ofeeder ocloser = liftIO mdreader >>= \case
Just ((si, f), Just (sha, size, _type)) Just ((si, f), Just (sha, size, _type))
| size < maxPointerSz -> do | size < maxPointerSz -> do
feedmatches matcher ofeeder si f sha feedmatches mi ofeeder si f sha
mdprocess matcher mdreader ofeeder ocloser mdprocess mi mdreader ofeeder ocloser
Just _ -> mdprocess matcher mdreader ofeeder ocloser Just _ -> mdprocess mi mdreader ofeeder ocloser
Nothing -> liftIO $ void ocloser Nothing -> liftIO $ void ocloser
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> WorkTreeItems -> Annex [(SeekInput, a)] seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> WorkTreeItems -> Annex [(SeekInput, a)]

View file

@ -95,6 +95,8 @@ limitInclude glob = Right $ MatchFiles
{ matchAction = const $ matchGlobFile glob { matchAction = const $ matchGlobFile glob
, matchNeedsFileName = True , matchNeedsFileName = True
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
{- Add a limit to skip files that match the glob. -} {- Add a limit to skip files that match the glob. -}
@ -106,6 +108,8 @@ limitExclude glob = Right $ MatchFiles
{ matchAction = const $ not <$$> matchGlobFile glob { matchAction = const $ not <$$> matchGlobFile glob
, matchNeedsFileName = True , matchNeedsFileName = True
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
matchGlobFile :: String -> MatchInfo -> Annex Bool matchGlobFile :: String -> MatchInfo -> Annex Bool
@ -145,6 +149,8 @@ matchMagic _limitname querymagic selectprovidedinfo (Just magic) glob =
{ matchAction = const go { matchAction = const go
, matchNeedsFileName = True , matchNeedsFileName = True
, matchNeedsFileContent = True , matchNeedsFileContent = True
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
where where
cglob = compileGlob glob CaseSensative -- memoized cglob = compileGlob glob CaseSensative -- memoized
@ -162,6 +168,8 @@ addUnlocked = addLimit $ Right $ MatchFiles
{ matchAction = const $ matchLockStatus False { matchAction = const $ matchLockStatus False
, matchNeedsFileName = True , matchNeedsFileName = True
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
addLocked :: Annex () addLocked :: Annex ()
@ -169,6 +177,8 @@ addLocked = addLimit $ Right $ MatchFiles
{ matchAction = const $ matchLockStatus True { matchAction = const $ matchLockStatus True
, matchNeedsFileName = True , matchNeedsFileName = True
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
matchLockStatus :: Bool -> MatchInfo -> Annex Bool matchLockStatus :: Bool -> MatchInfo -> Annex Bool
@ -188,14 +198,16 @@ addIn s = do
u <- Remote.nameToUUID name u <- Remote.nameToUUID name
hereu <- getUUID hereu <- getUUID
addLimit $ if u == hereu && null date addLimit $ if u == hereu && null date
then use inhere then use True inhere
else use (inuuid u) else use False (inuuid u)
where where
(name, date) = separate (== '@') s (name, date) = separate (== '@') s
use a = Right $ MatchFiles use inhere a = Right $ MatchFiles
{ matchAction = checkKey . a { matchAction = checkKey . a
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = not inhere
} }
inuuid u notpresent key inuuid u notpresent key
| null date = do | null date = do
@ -224,6 +236,8 @@ limitPresent u = MatchFiles
return $ maybe False (`elem` us) u return $ maybe False (`elem` us) u
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = not (isNothing u)
} }
{- Limit to content that is in a directory, anywhere in the repository tree -} {- Limit to content that is in a directory, anywhere in the repository tree -}
@ -232,6 +246,8 @@ limitInDir dir = MatchFiles
{ matchAction = const go { matchAction = const go
, matchNeedsFileName = True , matchNeedsFileName = True
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
where where
go (MatchingFile fi) = checkf $ fromRawFilePath $ matchFile fi go (MatchingFile fi) = checkf $ fromRawFilePath $ matchFile fi
@ -262,6 +278,8 @@ limitCopies want = case splitc ':' want of
go' n good notpresent go' n good notpresent
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = True
} }
go' n good notpresent key = do go' n good notpresent key = do
us <- filter (`S.notMember` notpresent) us <- filter (`S.notMember` notpresent)
@ -284,6 +302,8 @@ limitLackingCopies approx want = case readish want of
go mi needed notpresent go mi needed notpresent
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = True
} }
Nothing -> Left "bad value for number of lacking copies" Nothing -> Left "bad value for number of lacking copies"
where where
@ -310,6 +330,8 @@ limitUnused = MatchFiles
{ matchAction = go { matchAction = go
, matchNeedsFileName = True , matchNeedsFileName = True
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
} }
where where
go _ (MatchingFile _) = return False go _ (MatchingFile _) = return False
@ -324,6 +346,8 @@ limitAnything = MatchFiles
{ matchAction = \_ _ -> return True { matchAction = \_ _ -> return True
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
{- Limit that never matches. -} {- Limit that never matches. -}
@ -332,6 +356,8 @@ limitNothing = MatchFiles
{ matchAction = \_ _ -> return False { matchAction = \_ _ -> return False
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
{- Adds a limit to skip files not believed to be present in all {- Adds a limit to skip files not believed to be present in all
@ -352,6 +378,8 @@ limitInAllGroup getgroupmap groupname = Right $ MatchFiles
else checkKey (check want) mi else checkKey (check want) mi
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = True
} }
where where
check want key = do check want key = do
@ -367,6 +395,8 @@ limitInBackend name = Right $ MatchFiles
{ matchAction = const $ checkKey check { matchAction = const $ checkKey check
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
} }
where where
check key = pure $ fromKey keyVariety key == variety check key = pure $ fromKey keyVariety key == variety
@ -381,6 +411,8 @@ limitSecureHash = MatchFiles
{ matchAction = const $ checkKey isCryptographicallySecure { matchAction = const $ checkKey isCryptographicallySecure
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
} }
{- Adds a limit to skip files that are too large or too small -} {- Adds a limit to skip files that are too large or too small -}
@ -399,6 +431,8 @@ limitSize lb vs s = case readSize dataUnits s of
LimitAnnexFiles -> False LimitAnnexFiles -> False
LimitDiskFiles -> True LimitDiskFiles -> True
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
where where
go sz _ (MatchingFile fi) = case lb of go sz _ (MatchingFile fi) = case lb of
@ -425,6 +459,8 @@ limitMetaData s = case parseMetaDataMatcher s of
{ matchAction = const $ checkKey (check f matching) { matchAction = const $ checkKey (check f matching)
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
} }
where where
check f matching k = not . S.null check f matching k = not . S.null
@ -446,6 +482,8 @@ addTimeLimit duration = do
else return True else return True
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
addAccessedWithin :: Duration -> Annex () addAccessedWithin :: Duration -> Annex ()
@ -455,6 +493,8 @@ addAccessedWithin duration = do
{ matchAction = const $ checkKey $ check now { matchAction = const $ checkKey $ check now
, matchNeedsFileName = False , matchNeedsFileName = False
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
} }
where where
check now k = inAnnexCheck k $ \f -> check now k = inAnnexCheck k $ \f ->

View file

@ -1,6 +1,6 @@
{- git-annex limits by wanted status {- git-annex limits by wanted status
- -
- Copyright 2012 Joey Hess <id@joeyh.name> - Copyright 2012-2020 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -11,19 +11,26 @@ import Annex.Common
import Annex.Wanted import Annex.Wanted
import Limit import Limit
import Types.FileMatcher import Types.FileMatcher
import Logs.PreferredContent
addWantGet :: Annex () addWantGet :: Annex ()
addWantGet = addLimit $ Right $ MatchFiles addWantGet = addPreferredContentLimit $
{ matchAction = const $ checkWant $ wantGet False Nothing checkWant $ wantGet False Nothing
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
addWantDrop :: Annex () addWantDrop :: Annex ()
addWantDrop = addLimit $ Right $ MatchFiles addWantDrop = addPreferredContentLimit $
{ matchAction = const $ checkWant $ wantDrop False Nothing Nothing checkWant $ wantDrop False Nothing Nothing
, matchNeedsFileName = False
, matchNeedsFileContent = False addPreferredContentLimit :: (MatchInfo -> Annex Bool) -> Annex ()
addPreferredContentLimit a = do
nfn <- introspectPreferredRequiredContent matchNeedsFileName Nothing
nfc <- introspectPreferredRequiredContent matchNeedsFileContent Nothing
nk <- introspectPreferredRequiredContent matchNeedsKey Nothing
addLimit $ Right $ MatchFiles
{ matchAction = const a
, matchNeedsFileName = nfn
, matchNeedsFileContent = nfc
, matchNeedsKey = nk
} }
checkWant :: (AssociatedFile -> Annex Bool) -> MatchInfo -> Annex Bool checkWant :: (AssociatedFile -> Annex Bool) -> MatchInfo -> Annex Bool

View file

@ -62,6 +62,10 @@ data MatchFiles a = MatchFiles
, matchNeedsFileContent :: Bool , matchNeedsFileContent :: Bool
-- ^ does the matchAction need the file content to be present in -- ^ does the matchAction need the file content to be present in
-- order to succeed? -- order to succeed?
, matchNeedsKey :: Bool
-- ^ does the matchAction look at information about the key?
, matchNeedsLocationLog :: Bool
-- ^ does the matchAction look at the location log?
} }
type FileMatcher a = Matcher (MatchFiles a) type FileMatcher a = Matcher (MatchFiles a)

View file

@ -11,3 +11,5 @@ log for limits that need that), otherwise before getting the key.
> So this needs a way to introspect a limit to see if the terms used in it > So this needs a way to introspect a limit to see if the terms used in it
> match some criteria. Another todo that also needs that is > match some criteria. Another todo that also needs that is
> [[sync_fast_import]] --[[Joey]] > [[sync_fast_import]] --[[Joey]]
[[done]] --[[Joey]]