seek: defer matcher check until more info is known

Sped up seeking for files to operate on, when using options like --copies
or --in, by around 20%.

Benchmark showed an increase for --copies from 155 seconds to 121
seconds, and --in remote will be similar to that.

For --in here, the speedup was less, 5-10% or so.

(both warm cache)

This commit was sponsored by Jack Hill on Patreon.
This commit is contained in:
Joey Hess 2020-09-24 17:59:05 -04:00
parent c2d1d4e16e
commit ace02f41b0
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 149 additions and 66 deletions

View file

@ -269,5 +269,6 @@ call (Right sub) = Right $ Operation $ MatchFiles
matchMrun sub $ \o -> matchAction o notpresent mi
, matchNeedsFileName = any matchNeedsFileName sub
, matchNeedsFileContent = any matchNeedsFileContent sub
, matchNeedsKey = any matchNeedsKey sub
}
call (Left err) = Left err

View file

@ -21,6 +21,8 @@ git-annex (8.20200909) UNRELEASED; urgency=medium
* sync --all: Sped up seeking to around twice as fast, by avoiding a
pass over the worktree files when preferred content expressions of the
local repo and remotes don't use include=/exclude=.
* Sped up seeking for files to operate on, when using options like
--copies or --in, by around 20%
-- Joey Hess <id@joeyh.name> Mon, 14 Sep 2020 18:34:37 -0400

View file

@ -278,6 +278,18 @@ seekFiltered a fs = do
process matcher v@(_si, f) =
whenM (matcher $ MatchingFile $ FileInfo f f) (a v)
data MatcherInfo = MatcherInfo
{ matcherAction :: MatchInfo -> Annex Bool
, matcherNeedsFileName :: Bool
, matcherNeedsKey :: Bool
, matcherNeedsLocationLog :: Bool
}
checkMatcherWhen :: MatcherInfo -> Bool -> MatchInfo -> Annex () -> Annex ()
checkMatcherWhen mi c i a
| c = whenM (matcherAction mi i) a
| otherwise = a
-- This is significantly faster than using lookupKey after seekFiltered,
-- because of the way data is streamed through git cat-file.
--
@ -285,7 +297,11 @@ seekFiltered a fs = do
seekFilteredKeys :: AnnexedFileSeeker -> Annex [(SeekInput, (RawFilePath, Git.Sha, FileMode))] -> Annex ()
seekFilteredKeys seeker listfs = do
g <- Annex.gitRepo
matcher <- Limit.getMatcher
mi <- MatcherInfo
<$> Limit.getMatcher
<*> Limit.introspect matchNeedsFileName
<*> Limit.introspect matchNeedsKey
<*> Limit.introspect matchNeedsLocationLog
config <- Annex.getGitConfig
-- Run here, not in the async, because it could throw an exception
-- The list should be built lazily.
@ -293,93 +309,104 @@ seekFilteredKeys seeker listfs = do
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
catObjectStream g $ \ofeeder ocloser oreader -> do
processertid <- liftIO . async =<< forkState
(process matcher ofeeder mdfeeder mdcloser False l)
(process mi ofeeder mdfeeder mdcloser False l)
mdprocessertid <- liftIO . async =<< forkState
(mdprocess matcher mdreader ofeeder ocloser)
if usesLocationLog seeker
(mdprocess mi mdreader ofeeder ocloser)
if usesLocationLog seeker || matcherNeedsLocationLog mi
then catObjectStream g $ \lfeeder lcloser lreader -> do
precachertid <- liftIO . async =<< forkState
(precacher config oreader lfeeder lcloser)
precachefinisher lreader
(precacher mi config oreader lfeeder lcloser)
precachefinisher mi lreader
join (liftIO (wait precachertid))
else finisher oreader
else finisher mi oreader
join (liftIO (wait mdprocessertid))
join (liftIO (wait processertid))
where
finisher mi oreader = liftIO oreader >>= \case
Just ((si, f), content) -> do
keyaction f mi content $
commandAction . startAction seeker si f
finisher mi oreader
Nothing -> return ()
precachefinisher mi lreader = liftIO lreader >>= \case
Just ((logf, (si, f), k), logcontent) -> do
maybe noop (Annex.BranchState.setCache logf) logcontent
checkMatcherWhen mi
(matcherNeedsLocationLog mi && not (matcherNeedsFileName mi))
(MatchingKey k (AssociatedFile (Just f)))
(commandAction $ startAction seeker si f k)
precachefinisher mi lreader
Nothing -> return ()
precacher mi config oreader lfeeder lcloser = liftIO oreader >>= \case
Just ((si, f), content) -> do
keyaction f mi content $ \k ->
let logf = locationLogFile config k
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ lfeeder ((logf, (si, f), k), ref)
precacher mi config oreader lfeeder lcloser
Nothing -> liftIO $ void lcloser
feedmatches mi ofeeder si f sha = checkMatcherWhen mi
-- When the matcher needs a key or location log
-- (and does not need a worktree filename), it will be
-- checked later, to avoid a slow lookup here.
(not ((matcherNeedsKey mi || matcherNeedsLocationLog mi)
&& not (matcherNeedsFileName mi)))
(MatchingFile $ FileInfo f f)
(liftIO $ ofeeder ((si, f), sha))
keyaction f mi content a =
case parseLinkTargetOrPointerLazy =<< content of
Just k -> checkMatcherWhen mi
(matcherNeedsKey mi && not (matcherNeedsFileName mi || matcherNeedsLocationLog mi))
(MatchingKey k (AssociatedFile (Just f)))
(checkpresence k (a k))
Nothing -> noop
checkpresence k cont = case checkContentPresent seeker of
Just v -> do
present <- inAnnex k
when (present == v) cont
Nothing -> cont
finisher oreader = liftIO oreader >>= \case
Just ((si, f), content) -> do
case parseLinkTargetOrPointerLazy =<< content of
Just k -> checkpresence k $
commandAction $
startAction seeker si f k
Nothing -> noop
finisher oreader
Nothing -> return ()
precachefinisher lreader = liftIO lreader >>= \case
Just ((logf, (si, f), k), logcontent) -> do
maybe noop (Annex.BranchState.setCache logf) logcontent
commandAction $ startAction seeker si f k
precachefinisher lreader
Nothing -> return ()
precacher config oreader lfeeder lcloser = liftIO oreader >>= \case
Just ((si, f), content) -> do
case parseLinkTargetOrPointerLazy =<< content of
Just k -> checkpresence k $
let logf = locationLogFile config k
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ lfeeder ((logf, (si, f), k), ref)
Nothing -> noop
precacher config oreader lfeeder lcloser
Nothing -> liftIO $ void lcloser
feedmatches matcher ofeeder si f sha =
whenM (matcher $ MatchingFile $ FileInfo f f) $
liftIO $ ofeeder ((si, f), sha)
process matcher ofeeder mdfeeder mdcloser seenpointer ((si, (f, sha, mode)):rest) =
process mi ofeeder mdfeeder mdcloser seenpointer ((si, (f, sha, mode)):rest) =
case Git.toTreeItemType mode of
Just Git.TreeSymlink -> do
whenM (exists f) $
-- Once a pointer file has been seen,
-- symlinks have to be sent via the
-- metadata processor too. That is slightly
-- slower, but preserves the requested
-- file order.
-- metadata processor too. That is
-- slightly slower, but preserves the
-- requested file order.
if seenpointer
then liftIO $ mdfeeder ((si, f), sha)
else feedmatches matcher ofeeder si f sha
process matcher ofeeder mdfeeder mdcloser seenpointer rest
else feedmatches mi ofeeder si f sha
process mi ofeeder mdfeeder mdcloser seenpointer rest
Just Git.TreeSubmodule ->
process matcher ofeeder mdfeeder mdcloser seenpointer rest
process mi ofeeder mdfeeder mdcloser seenpointer rest
-- Might be a pointer file, might be other
-- file in git, possibly large. Avoid catting
-- large files by first looking up the size.
Just _ -> do
whenM (exists f) $
liftIO $ mdfeeder ((si, f), sha)
process matcher ofeeder mdfeeder mdcloser True rest
process mi ofeeder mdfeeder mdcloser True rest
Nothing ->
process matcher ofeeder mdfeeder mdcloser seenpointer rest
process mi ofeeder mdfeeder mdcloser seenpointer rest
process _ _ _ mdcloser _ [] = liftIO $ void mdcloser
-- Check if files exist, because a deleted file will still be
-- listed by ls-tree, but should not be processed.
exists p = isJust <$> liftIO (catchMaybeIO $ R.getSymbolicLinkStatus p)
mdprocess matcher mdreader ofeeder ocloser = liftIO mdreader >>= \case
mdprocess mi mdreader ofeeder ocloser = liftIO mdreader >>= \case
Just ((si, f), Just (sha, size, _type))
| size < maxPointerSz -> do
feedmatches matcher ofeeder si f sha
mdprocess matcher mdreader ofeeder ocloser
Just _ -> mdprocess matcher mdreader ofeeder ocloser
feedmatches mi ofeeder si f sha
mdprocess mi mdreader ofeeder ocloser
Just _ -> mdprocess mi mdreader ofeeder ocloser
Nothing -> liftIO $ void ocloser
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> WorkTreeItems -> Annex [(SeekInput, a)]

View file

@ -95,6 +95,8 @@ limitInclude glob = Right $ MatchFiles
{ matchAction = const $ matchGlobFile glob
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
{- Add a limit to skip files that match the glob. -}
@ -106,6 +108,8 @@ limitExclude glob = Right $ MatchFiles
{ matchAction = const $ not <$$> matchGlobFile glob
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
matchGlobFile :: String -> MatchInfo -> Annex Bool
@ -145,6 +149,8 @@ matchMagic _limitname querymagic selectprovidedinfo (Just magic) glob =
{ matchAction = const go
, matchNeedsFileName = True
, matchNeedsFileContent = True
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
where
cglob = compileGlob glob CaseSensative -- memoized
@ -162,6 +168,8 @@ addUnlocked = addLimit $ Right $ MatchFiles
{ matchAction = const $ matchLockStatus False
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
addLocked :: Annex ()
@ -169,6 +177,8 @@ addLocked = addLimit $ Right $ MatchFiles
{ matchAction = const $ matchLockStatus True
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
matchLockStatus :: Bool -> MatchInfo -> Annex Bool
@ -188,14 +198,16 @@ addIn s = do
u <- Remote.nameToUUID name
hereu <- getUUID
addLimit $ if u == hereu && null date
then use inhere
else use (inuuid u)
then use True inhere
else use False (inuuid u)
where
(name, date) = separate (== '@') s
use a = Right $ MatchFiles
use inhere a = Right $ MatchFiles
{ matchAction = checkKey . a
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = not inhere
}
inuuid u notpresent key
| null date = do
@ -224,6 +236,8 @@ limitPresent u = MatchFiles
return $ maybe False (`elem` us) u
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = not (isNothing u)
}
{- Limit to content that is in a directory, anywhere in the repository tree -}
@ -232,6 +246,8 @@ limitInDir dir = MatchFiles
{ matchAction = const go
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
where
go (MatchingFile fi) = checkf $ fromRawFilePath $ matchFile fi
@ -262,6 +278,8 @@ limitCopies want = case splitc ':' want of
go' n good notpresent
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = True
}
go' n good notpresent key = do
us <- filter (`S.notMember` notpresent)
@ -284,6 +302,8 @@ limitLackingCopies approx want = case readish want of
go mi needed notpresent
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = True
}
Nothing -> Left "bad value for number of lacking copies"
where
@ -310,6 +330,8 @@ limitUnused = MatchFiles
{ matchAction = go
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
}
where
go _ (MatchingFile _) = return False
@ -324,6 +346,8 @@ limitAnything = MatchFiles
{ matchAction = \_ _ -> return True
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
{- Limit that never matches. -}
@ -332,6 +356,8 @@ limitNothing = MatchFiles
{ matchAction = \_ _ -> return False
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
{- Adds a limit to skip files not believed to be present in all
@ -352,6 +378,8 @@ limitInAllGroup getgroupmap groupname = Right $ MatchFiles
else checkKey (check want) mi
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = True
}
where
check want key = do
@ -367,6 +395,8 @@ limitInBackend name = Right $ MatchFiles
{ matchAction = const $ checkKey check
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
}
where
check key = pure $ fromKey keyVariety key == variety
@ -381,6 +411,8 @@ limitSecureHash = MatchFiles
{ matchAction = const $ checkKey isCryptographicallySecure
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
}
{- Adds a limit to skip files that are too large or too small -}
@ -399,6 +431,8 @@ limitSize lb vs s = case readSize dataUnits s of
LimitAnnexFiles -> False
LimitDiskFiles -> True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
where
go sz _ (MatchingFile fi) = case lb of
@ -425,6 +459,8 @@ limitMetaData s = case parseMetaDataMatcher s of
{ matchAction = const $ checkKey (check f matching)
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
}
where
check f matching k = not . S.null
@ -446,6 +482,8 @@ addTimeLimit duration = do
else return True
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
addAccessedWithin :: Duration -> Annex ()
@ -455,6 +493,8 @@ addAccessedWithin duration = do
{ matchAction = const $ checkKey $ check now
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
where
check now k = inAnnexCheck k $ \f ->

View file

@ -1,6 +1,6 @@
{- git-annex limits by wanted status
-
- Copyright 2012 Joey Hess <id@joeyh.name>
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -11,19 +11,26 @@ import Annex.Common
import Annex.Wanted
import Limit
import Types.FileMatcher
import Logs.PreferredContent
addWantGet :: Annex ()
addWantGet = addLimit $ Right $ MatchFiles
{ matchAction = const $ checkWant $ wantGet False Nothing
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
addWantGet = addPreferredContentLimit $
checkWant $ wantGet False Nothing
addWantDrop :: Annex ()
addWantDrop = addLimit $ Right $ MatchFiles
{ matchAction = const $ checkWant $ wantDrop False Nothing Nothing
, matchNeedsFileName = False
, matchNeedsFileContent = False
addWantDrop = addPreferredContentLimit $
checkWant $ wantDrop False Nothing Nothing
addPreferredContentLimit :: (MatchInfo -> Annex Bool) -> Annex ()
addPreferredContentLimit a = do
nfn <- introspectPreferredRequiredContent matchNeedsFileName Nothing
nfc <- introspectPreferredRequiredContent matchNeedsFileContent Nothing
nk <- introspectPreferredRequiredContent matchNeedsKey Nothing
addLimit $ Right $ MatchFiles
{ matchAction = const a
, matchNeedsFileName = nfn
, matchNeedsFileContent = nfc
, matchNeedsKey = nk
}
checkWant :: (AssociatedFile -> Annex Bool) -> MatchInfo -> Annex Bool

View file

@ -62,6 +62,10 @@ data MatchFiles a = MatchFiles
, matchNeedsFileContent :: Bool
-- ^ does the matchAction need the file content to be present in
-- order to succeed?
, matchNeedsKey :: Bool
-- ^ does the matchAction look at information about the key?
, matchNeedsLocationLog :: Bool
-- ^ does the matchAction look at the location log?
}
type FileMatcher a = Matcher (MatchFiles a)

View file

@ -11,3 +11,5 @@ log for limits that need that), otherwise before getting the key.
> So this needs a way to introspect a limit to see if the terms used in it
> match some criteria. Another todo that also needs that is
> [[sync_fast_import]] --[[Joey]]
[[done]] --[[Joey]]