seek: defer matcher check until more info is known
Sped up seeking for files to operate on, when using options like --copies or --in, by around 20%. A benchmark showed --copies improving from 155 seconds to 121 seconds, and --in remote should be similar to that. For --in here, the speedup was smaller, 5-10% or so. (Both measured with a warm cache.) This commit was sponsored by Jack Hill on Patreon.
This commit is contained in:
parent
c2d1d4e16e
commit
ace02f41b0
7 changed files with 149 additions and 66 deletions
|
@ -269,5 +269,6 @@ call (Right sub) = Right $ Operation $ MatchFiles
|
|||
matchMrun sub $ \o -> matchAction o notpresent mi
|
||||
, matchNeedsFileName = any matchNeedsFileName sub
|
||||
, matchNeedsFileContent = any matchNeedsFileContent sub
|
||||
, matchNeedsKey = any matchNeedsKey sub
|
||||
}
|
||||
call (Left err) = Left err
|
||||
|
|
|
@ -21,6 +21,8 @@ git-annex (8.20200909) UNRELEASED; urgency=medium
|
|||
* sync --all: Sped up seeking to around twice as fast, by avoiding a
|
||||
pass over the worktree files when preferred content expressions of the
|
||||
local repo and remotes don't use include=/exclude=.
|
||||
* Sped up seeking for files to operate on, when using options like
|
||||
--copies or --in, by around 20%
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 14 Sep 2020 18:34:37 -0400
|
||||
|
||||
|
|
131
CmdLine/Seek.hs
131
CmdLine/Seek.hs
|
@ -278,6 +278,18 @@ seekFiltered a fs = do
|
|||
process matcher v@(_si, f) =
|
||||
whenM (matcher $ MatchingFile $ FileInfo f f) (a v)
|
||||
|
||||
-- Bundles the matcher action with flags describing what information the
-- matcher needs. seekFilteredKeys builds this from Limit.getMatcher (the
-- action) and Limit.introspect on the correspondingly named matchNeeds*
-- field (each flag).
data MatcherInfo = MatcherInfo
|
||||
{ matcherAction :: MatchInfo -> Annex Bool
|
||||
, matcherNeedsFileName :: Bool
|
||||
, matcherNeedsKey :: Bool
|
||||
, matcherNeedsLocationLog :: Bool
|
||||
}
|
||||
|
||||
-- Conditionally guards an action with the matcher.
--
-- When the Bool is True, the action is wrapped in whenM on matcherAction
-- applied to the MatchInfo, so it is gated on the matcher's result.
-- When the Bool is False, the matcher is not consulted here and the
-- action is run unconditionally (callers use this to defer the check to
-- a later stage).
checkMatcherWhen :: MatcherInfo -> Bool -> MatchInfo -> Annex () -> Annex ()
|
||||
checkMatcherWhen mi c i a
|
||||
| c = whenM (matcherAction mi i) a
|
||||
| otherwise = a
|
||||
|
||||
-- This is significantly faster than using lookupKey after seekFiltered,
|
||||
-- because of the way data is streamed through git cat-file.
|
||||
--
|
||||
|
@ -285,7 +297,11 @@ seekFiltered a fs = do
|
|||
seekFilteredKeys :: AnnexedFileSeeker -> Annex [(SeekInput, (RawFilePath, Git.Sha, FileMode))] -> Annex ()
|
||||
seekFilteredKeys seeker listfs = do
|
||||
g <- Annex.gitRepo
|
||||
matcher <- Limit.getMatcher
|
||||
mi <- MatcherInfo
|
||||
<$> Limit.getMatcher
|
||||
<*> Limit.introspect matchNeedsFileName
|
||||
<*> Limit.introspect matchNeedsKey
|
||||
<*> Limit.introspect matchNeedsLocationLog
|
||||
config <- Annex.getGitConfig
|
||||
-- Run here, not in the async, because it could throw an exception
|
||||
-- The list should be built lazily.
|
||||
|
@ -293,93 +309,104 @@ seekFilteredKeys seeker listfs = do
|
|||
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
|
||||
catObjectStream g $ \ofeeder ocloser oreader -> do
|
||||
processertid <- liftIO . async =<< forkState
|
||||
(process matcher ofeeder mdfeeder mdcloser False l)
|
||||
(process mi ofeeder mdfeeder mdcloser False l)
|
||||
mdprocessertid <- liftIO . async =<< forkState
|
||||
(mdprocess matcher mdreader ofeeder ocloser)
|
||||
if usesLocationLog seeker
|
||||
(mdprocess mi mdreader ofeeder ocloser)
|
||||
if usesLocationLog seeker || matcherNeedsLocationLog mi
|
||||
then catObjectStream g $ \lfeeder lcloser lreader -> do
|
||||
precachertid <- liftIO . async =<< forkState
|
||||
(precacher config oreader lfeeder lcloser)
|
||||
precachefinisher lreader
|
||||
(precacher mi config oreader lfeeder lcloser)
|
||||
precachefinisher mi lreader
|
||||
join (liftIO (wait precachertid))
|
||||
else finisher oreader
|
||||
else finisher mi oreader
|
||||
join (liftIO (wait mdprocessertid))
|
||||
join (liftIO (wait processertid))
|
||||
where
|
||||
finisher mi oreader = liftIO oreader >>= \case
|
||||
Just ((si, f), content) -> do
|
||||
keyaction f mi content $
|
||||
commandAction . startAction seeker si f
|
||||
finisher mi oreader
|
||||
Nothing -> return ()
|
||||
|
||||
precachefinisher mi lreader = liftIO lreader >>= \case
|
||||
Just ((logf, (si, f), k), logcontent) -> do
|
||||
maybe noop (Annex.BranchState.setCache logf) logcontent
|
||||
checkMatcherWhen mi
|
||||
(matcherNeedsLocationLog mi && not (matcherNeedsFileName mi))
|
||||
(MatchingKey k (AssociatedFile (Just f)))
|
||||
(commandAction $ startAction seeker si f k)
|
||||
precachefinisher mi lreader
|
||||
Nothing -> return ()
|
||||
|
||||
precacher mi config oreader lfeeder lcloser = liftIO oreader >>= \case
|
||||
Just ((si, f), content) -> do
|
||||
keyaction f mi content $ \k ->
|
||||
let logf = locationLogFile config k
|
||||
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
|
||||
in liftIO $ lfeeder ((logf, (si, f), k), ref)
|
||||
precacher mi config oreader lfeeder lcloser
|
||||
Nothing -> liftIO $ void lcloser
|
||||
|
||||
feedmatches mi ofeeder si f sha = checkMatcherWhen mi
|
||||
-- When the matcher needs a key or location log
|
||||
-- (and does not need a worktree filename), it will be
|
||||
-- checked later, to avoid a slow lookup here.
|
||||
(not ((matcherNeedsKey mi || matcherNeedsLocationLog mi)
|
||||
&& not (matcherNeedsFileName mi)))
|
||||
(MatchingFile $ FileInfo f f)
|
||||
(liftIO $ ofeeder ((si, f), sha))
|
||||
|
||||
keyaction f mi content a =
|
||||
case parseLinkTargetOrPointerLazy =<< content of
|
||||
Just k -> checkMatcherWhen mi
|
||||
(matcherNeedsKey mi && not (matcherNeedsFileName mi || matcherNeedsLocationLog mi))
|
||||
(MatchingKey k (AssociatedFile (Just f)))
|
||||
(checkpresence k (a k))
|
||||
Nothing -> noop
|
||||
|
||||
checkpresence k cont = case checkContentPresent seeker of
|
||||
Just v -> do
|
||||
present <- inAnnex k
|
||||
when (present == v) cont
|
||||
Nothing -> cont
|
||||
|
||||
finisher oreader = liftIO oreader >>= \case
|
||||
Just ((si, f), content) -> do
|
||||
case parseLinkTargetOrPointerLazy =<< content of
|
||||
Just k -> checkpresence k $
|
||||
commandAction $
|
||||
startAction seeker si f k
|
||||
Nothing -> noop
|
||||
finisher oreader
|
||||
Nothing -> return ()
|
||||
|
||||
precachefinisher lreader = liftIO lreader >>= \case
|
||||
Just ((logf, (si, f), k), logcontent) -> do
|
||||
maybe noop (Annex.BranchState.setCache logf) logcontent
|
||||
commandAction $ startAction seeker si f k
|
||||
precachefinisher lreader
|
||||
Nothing -> return ()
|
||||
|
||||
precacher config oreader lfeeder lcloser = liftIO oreader >>= \case
|
||||
Just ((si, f), content) -> do
|
||||
case parseLinkTargetOrPointerLazy =<< content of
|
||||
Just k -> checkpresence k $
|
||||
let logf = locationLogFile config k
|
||||
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
|
||||
in liftIO $ lfeeder ((logf, (si, f), k), ref)
|
||||
Nothing -> noop
|
||||
precacher config oreader lfeeder lcloser
|
||||
Nothing -> liftIO $ void lcloser
|
||||
|
||||
feedmatches matcher ofeeder si f sha =
|
||||
whenM (matcher $ MatchingFile $ FileInfo f f) $
|
||||
liftIO $ ofeeder ((si, f), sha)
|
||||
|
||||
process matcher ofeeder mdfeeder mdcloser seenpointer ((si, (f, sha, mode)):rest) =
|
||||
process mi ofeeder mdfeeder mdcloser seenpointer ((si, (f, sha, mode)):rest) =
|
||||
case Git.toTreeItemType mode of
|
||||
Just Git.TreeSymlink -> do
|
||||
whenM (exists f) $
|
||||
-- Once a pointer file has been seen,
|
||||
-- symlinks have to be sent via the
|
||||
-- metadata processor too. That is slightly
|
||||
-- slower, but preserves the requested
|
||||
-- file order.
|
||||
-- metadata processor too. That is
|
||||
-- slightly slower, but preserves the
|
||||
-- requested file order.
|
||||
if seenpointer
|
||||
then liftIO $ mdfeeder ((si, f), sha)
|
||||
else feedmatches matcher ofeeder si f sha
|
||||
process matcher ofeeder mdfeeder mdcloser seenpointer rest
|
||||
else feedmatches mi ofeeder si f sha
|
||||
process mi ofeeder mdfeeder mdcloser seenpointer rest
|
||||
Just Git.TreeSubmodule ->
|
||||
process matcher ofeeder mdfeeder mdcloser seenpointer rest
|
||||
process mi ofeeder mdfeeder mdcloser seenpointer rest
|
||||
-- Might be a pointer file, might be other
|
||||
-- file in git, possibly large. Avoid catting
|
||||
-- large files by first looking up the size.
|
||||
Just _ -> do
|
||||
whenM (exists f) $
|
||||
liftIO $ mdfeeder ((si, f), sha)
|
||||
process matcher ofeeder mdfeeder mdcloser True rest
|
||||
process mi ofeeder mdfeeder mdcloser True rest
|
||||
Nothing ->
|
||||
process matcher ofeeder mdfeeder mdcloser seenpointer rest
|
||||
process mi ofeeder mdfeeder mdcloser seenpointer rest
|
||||
process _ _ _ mdcloser _ [] = liftIO $ void mdcloser
|
||||
|
||||
-- Check if files exist, because a deleted file will still be
|
||||
-- listed by ls-tree, but should not be processed.
|
||||
exists p = isJust <$> liftIO (catchMaybeIO $ R.getSymbolicLinkStatus p)
|
||||
|
||||
mdprocess matcher mdreader ofeeder ocloser = liftIO mdreader >>= \case
|
||||
mdprocess mi mdreader ofeeder ocloser = liftIO mdreader >>= \case
|
||||
Just ((si, f), Just (sha, size, _type))
|
||||
| size < maxPointerSz -> do
|
||||
feedmatches matcher ofeeder si f sha
|
||||
mdprocess matcher mdreader ofeeder ocloser
|
||||
Just _ -> mdprocess matcher mdreader ofeeder ocloser
|
||||
feedmatches mi ofeeder si f sha
|
||||
mdprocess mi mdreader ofeeder ocloser
|
||||
Just _ -> mdprocess mi mdreader ofeeder ocloser
|
||||
Nothing -> liftIO $ void ocloser
|
||||
|
||||
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> WorkTreeItems -> Annex [(SeekInput, a)]
|
||||
|
|
46
Limit.hs
46
Limit.hs
|
@ -95,6 +95,8 @@ limitInclude glob = Right $ MatchFiles
|
|||
{ matchAction = const $ matchGlobFile glob
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
{- Add a limit to skip files that match the glob. -}
|
||||
|
@ -106,6 +108,8 @@ limitExclude glob = Right $ MatchFiles
|
|||
{ matchAction = const $ not <$$> matchGlobFile glob
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
matchGlobFile :: String -> MatchInfo -> Annex Bool
|
||||
|
@ -145,6 +149,8 @@ matchMagic _limitname querymagic selectprovidedinfo (Just magic) glob =
|
|||
{ matchAction = const go
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = True
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
where
|
||||
cglob = compileGlob glob CaseSensative -- memoized
|
||||
|
@ -162,6 +168,8 @@ addUnlocked = addLimit $ Right $ MatchFiles
|
|||
{ matchAction = const $ matchLockStatus False
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
addLocked :: Annex ()
|
||||
|
@ -169,6 +177,8 @@ addLocked = addLimit $ Right $ MatchFiles
|
|||
{ matchAction = const $ matchLockStatus True
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
matchLockStatus :: Bool -> MatchInfo -> Annex Bool
|
||||
|
@ -188,14 +198,16 @@ addIn s = do
|
|||
u <- Remote.nameToUUID name
|
||||
hereu <- getUUID
|
||||
addLimit $ if u == hereu && null date
|
||||
then use inhere
|
||||
else use (inuuid u)
|
||||
then use True inhere
|
||||
else use False (inuuid u)
|
||||
where
|
||||
(name, date) = separate (== '@') s
|
||||
use a = Right $ MatchFiles
|
||||
use inhere a = Right $ MatchFiles
|
||||
{ matchAction = checkKey . a
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = not inhere
|
||||
}
|
||||
inuuid u notpresent key
|
||||
| null date = do
|
||||
|
@ -224,6 +236,8 @@ limitPresent u = MatchFiles
|
|||
return $ maybe False (`elem` us) u
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = not (isNothing u)
|
||||
}
|
||||
|
||||
{- Limit to content that is in a directory, anywhere in the repository tree -}
|
||||
|
@ -232,6 +246,8 @@ limitInDir dir = MatchFiles
|
|||
{ matchAction = const go
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
where
|
||||
go (MatchingFile fi) = checkf $ fromRawFilePath $ matchFile fi
|
||||
|
@ -262,6 +278,8 @@ limitCopies want = case splitc ':' want of
|
|||
go' n good notpresent
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = True
|
||||
}
|
||||
go' n good notpresent key = do
|
||||
us <- filter (`S.notMember` notpresent)
|
||||
|
@ -284,6 +302,8 @@ limitLackingCopies approx want = case readish want of
|
|||
go mi needed notpresent
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = True
|
||||
}
|
||||
Nothing -> Left "bad value for number of lacking copies"
|
||||
where
|
||||
|
@ -310,6 +330,8 @@ limitUnused = MatchFiles
|
|||
{ matchAction = go
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
where
|
||||
go _ (MatchingFile _) = return False
|
||||
|
@ -324,6 +346,8 @@ limitAnything = MatchFiles
|
|||
{ matchAction = \_ _ -> return True
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
{- Limit that never matches. -}
|
||||
|
@ -332,6 +356,8 @@ limitNothing = MatchFiles
|
|||
{ matchAction = \_ _ -> return False
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
{- Adds a limit to skip files not believed to be present in all
|
||||
|
@ -352,6 +378,8 @@ limitInAllGroup getgroupmap groupname = Right $ MatchFiles
|
|||
else checkKey (check want) mi
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = True
|
||||
}
|
||||
where
|
||||
check want key = do
|
||||
|
@ -367,6 +395,8 @@ limitInBackend name = Right $ MatchFiles
|
|||
{ matchAction = const $ checkKey check
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
where
|
||||
check key = pure $ fromKey keyVariety key == variety
|
||||
|
@ -381,6 +411,8 @@ limitSecureHash = MatchFiles
|
|||
{ matchAction = const $ checkKey isCryptographicallySecure
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
{- Adds a limit to skip files that are too large or too small -}
|
||||
|
@ -399,6 +431,8 @@ limitSize lb vs s = case readSize dataUnits s of
|
|||
LimitAnnexFiles -> False
|
||||
LimitDiskFiles -> True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
where
|
||||
go sz _ (MatchingFile fi) = case lb of
|
||||
|
@ -425,6 +459,8 @@ limitMetaData s = case parseMetaDataMatcher s of
|
|||
{ matchAction = const $ checkKey (check f matching)
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = True
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
where
|
||||
check f matching k = not . S.null
|
||||
|
@ -446,6 +482,8 @@ addTimeLimit duration = do
|
|||
else return True
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
addAccessedWithin :: Duration -> Annex ()
|
||||
|
@ -455,6 +493,8 @@ addAccessedWithin duration = do
|
|||
{ matchAction = const $ checkKey $ check now
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
where
|
||||
check now k = inAnnexCheck k $ \f ->
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git-annex limits by wanted status
|
||||
-
|
||||
- Copyright 2012 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -11,20 +11,27 @@ import Annex.Common
|
|||
import Annex.Wanted
|
||||
import Limit
|
||||
import Types.FileMatcher
|
||||
import Logs.PreferredContent
|
||||
|
||||
addWantGet :: Annex ()
|
||||
addWantGet = addLimit $ Right $ MatchFiles
|
||||
{ matchAction = const $ checkWant $ wantGet False Nothing
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
}
|
||||
addWantGet = addPreferredContentLimit $
|
||||
checkWant $ wantGet False Nothing
|
||||
|
||||
addWantDrop :: Annex ()
|
||||
addWantDrop = addLimit $ Right $ MatchFiles
|
||||
{ matchAction = const $ checkWant $ wantDrop False Nothing Nothing
|
||||
, matchNeedsFileName = False
|
||||
, matchNeedsFileContent = False
|
||||
}
|
||||
addWantDrop = addPreferredContentLimit $
|
||||
checkWant $ wantDrop False Nothing Nothing
|
||||
|
||||
{- Adds a limit that runs the given check, with its matchNeeds* flags
 - taken from introspectPreferredRequiredContent rather than hard-coded.
 -
 - The Nothing argument is passed through to
 - introspectPreferredRequiredContent unchanged; presumably it selects
 - the local repository -- confirm against Logs.PreferredContent.
 -
 - Every field of MatchFiles must be initialized here, including
 - matchNeedsLocationLog. A field left out of the record construction is
 - bottom, and seeking forces it via Limit.introspect
 - matchNeedsLocationLog, which would fail at runtime. -}
addPreferredContentLimit :: (MatchInfo -> Annex Bool) -> Annex ()
addPreferredContentLimit a = do
	nfn <- introspectPreferredRequiredContent matchNeedsFileName Nothing
	nfc <- introspectPreferredRequiredContent matchNeedsFileContent Nothing
	nk <- introspectPreferredRequiredContent matchNeedsKey Nothing
	nl <- introspectPreferredRequiredContent matchNeedsLocationLog Nothing
	addLimit $ Right $ MatchFiles
		{ matchAction = const a
		, matchNeedsFileName = nfn
		, matchNeedsFileContent = nfc
		, matchNeedsKey = nk
		, matchNeedsLocationLog = nl
		}
|
||||
|
||||
checkWant :: (AssociatedFile -> Annex Bool) -> MatchInfo -> Annex Bool
|
||||
checkWant a (MatchingFile fi) = a (AssociatedFile (Just $ matchFile fi))
|
||||
|
|
|
@ -62,6 +62,10 @@ data MatchFiles a = MatchFiles
|
|||
, matchNeedsFileContent :: Bool
|
||||
-- ^ does the matchAction need the file content to be present in
|
||||
-- order to succeed?
|
||||
, matchNeedsKey :: Bool
|
||||
-- ^ does the matchAction look at information about the key?
|
||||
, matchNeedsLocationLog :: Bool
|
||||
-- ^ does the matchAction look at the location log?
|
||||
}
|
||||
|
||||
type FileMatcher a = Matcher (MatchFiles a)
|
||||
|
|
|
@ -11,3 +11,5 @@ log for limits that need that), otherwise before getting the key.
|
|||
> So this needs a way to introspect a limit to see if the terms used in it
|
||||
> match some criteria. Another todo that also needs that is
|
||||
> [[sync_fast_import]] --[[Joey]]
|
||||
|
||||
[[done]] --[[Joey]]
|
||||
|
|
Loading…
Reference in a new issue