diff --git a/Annex/FileMatcher.hs b/Annex/FileMatcher.hs index 73e6706480..7509eb9310 100644 --- a/Annex/FileMatcher.hs +++ b/Annex/FileMatcher.hs @@ -269,5 +269,7 @@ call (Right sub) = Right $ Operation $ MatchFiles matchMrun sub $ \o -> matchAction o notpresent mi , matchNeedsFileName = any matchNeedsFileName sub , matchNeedsFileContent = any matchNeedsFileContent sub + , matchNeedsKey = any matchNeedsKey sub + , matchNeedsLocationLog = any matchNeedsLocationLog sub } call (Left err) = Left err diff --git a/CHANGELOG b/CHANGELOG index 171a3b190e..730a0b876c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -21,6 +21,8 @@ git-annex (8.20200909) UNRELEASED; urgency=medium * sync --all: Sped up seeking to around twice as fast, by avoiding a pass over the worktree files when preferred content expressions of the local repo and remotes don't use include=/exclude=. + * Sped up seeking for files to operate on, when using options like + --copies or --in, by around 20% -- Joey Hess Mon, 14 Sep 2020 18:34:37 -0400 diff --git a/CmdLine/Seek.hs b/CmdLine/Seek.hs index 42ab541a96..a5631cfb47 100644 --- a/CmdLine/Seek.hs +++ b/CmdLine/Seek.hs @@ -278,6 +278,18 @@ seekFiltered a fs = do process matcher v@(_si, f) = whenM (matcher $ MatchingFile $ FileInfo f f) (a v) +data MatcherInfo = MatcherInfo + { matcherAction :: MatchInfo -> Annex Bool + , matcherNeedsFileName :: Bool + , matcherNeedsKey :: Bool + , matcherNeedsLocationLog :: Bool + } + +checkMatcherWhen :: MatcherInfo -> Bool -> MatchInfo -> Annex () -> Annex () +checkMatcherWhen mi c i a + | c = whenM (matcherAction mi i) a + | otherwise = a + -- This is significantly faster than using lookupKey after seekFiltered, -- because of the way data is streamed through git cat-file. 
-- @@ -285,7 +297,11 @@ seekFiltered a fs = do seekFilteredKeys :: AnnexedFileSeeker -> Annex [(SeekInput, (RawFilePath, Git.Sha, FileMode))] -> Annex () seekFilteredKeys seeker listfs = do g <- Annex.gitRepo - matcher <- Limit.getMatcher + mi <- MatcherInfo + <$> Limit.getMatcher + <*> Limit.introspect matchNeedsFileName + <*> Limit.introspect matchNeedsKey + <*> Limit.introspect matchNeedsLocationLog config <- Annex.getGitConfig -- Run here, not in the async, because it could throw an exception -- The list should be built lazily. @@ -293,93 +309,104 @@ seekFilteredKeys seeker listfs = do catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader -> catObjectStream g $ \ofeeder ocloser oreader -> do processertid <- liftIO . async =<< forkState - (process matcher ofeeder mdfeeder mdcloser False l) + (process mi ofeeder mdfeeder mdcloser False l) mdprocessertid <- liftIO . async =<< forkState - (mdprocess matcher mdreader ofeeder ocloser) - if usesLocationLog seeker + (mdprocess mi mdreader ofeeder ocloser) + if usesLocationLog seeker || matcherNeedsLocationLog mi then catObjectStream g $ \lfeeder lcloser lreader -> do precachertid <- liftIO . async =<< forkState - (precacher config oreader lfeeder lcloser) - precachefinisher lreader + (precacher mi config oreader lfeeder lcloser) + precachefinisher mi lreader join (liftIO (wait precachertid)) - else finisher oreader + else finisher mi oreader join (liftIO (wait mdprocessertid)) join (liftIO (wait processertid)) where + finisher mi oreader = liftIO oreader >>= \case + Just ((si, f), content) -> do + keyaction f mi content $ + commandAction . 
startAction seeker si f + finisher mi oreader + Nothing -> return () + + precachefinisher mi lreader = liftIO lreader >>= \case + Just ((logf, (si, f), k), logcontent) -> do + maybe noop (Annex.BranchState.setCache logf) logcontent + checkMatcherWhen mi + (matcherNeedsLocationLog mi && not (matcherNeedsFileName mi)) + (MatchingKey k (AssociatedFile (Just f))) + (commandAction $ startAction seeker si f k) + precachefinisher mi lreader + Nothing -> return () + + precacher mi config oreader lfeeder lcloser = liftIO oreader >>= \case + Just ((si, f), content) -> do + keyaction f mi content $ \k -> + let logf = locationLogFile config k + ref = Git.Ref.branchFileRef Annex.Branch.fullname logf + in liftIO $ lfeeder ((logf, (si, f), k), ref) + precacher mi config oreader lfeeder lcloser + Nothing -> liftIO $ void lcloser + + feedmatches mi ofeeder si f sha = checkMatcherWhen mi + -- When the matcher needs a key or location log + -- (and does not need a worktree filename), it will be + -- checked later, to avoid a slow lookup here. 
+ (not ((matcherNeedsKey mi || matcherNeedsLocationLog mi) + && not (matcherNeedsFileName mi))) + (MatchingFile $ FileInfo f f) + (liftIO $ ofeeder ((si, f), sha)) + + keyaction f mi content a = + case parseLinkTargetOrPointerLazy =<< content of + Just k -> checkMatcherWhen mi + (matcherNeedsKey mi && not (matcherNeedsFileName mi || matcherNeedsLocationLog mi)) + (MatchingKey k (AssociatedFile (Just f))) + (checkpresence k (a k)) + Nothing -> noop + checkpresence k cont = case checkContentPresent seeker of Just v -> do present <- inAnnex k when (present == v) cont Nothing -> cont - finisher oreader = liftIO oreader >>= \case - Just ((si, f), content) -> do - case parseLinkTargetOrPointerLazy =<< content of - Just k -> checkpresence k $ - commandAction $ - startAction seeker si f k - Nothing -> noop - finisher oreader - Nothing -> return () - - precachefinisher lreader = liftIO lreader >>= \case - Just ((logf, (si, f), k), logcontent) -> do - maybe noop (Annex.BranchState.setCache logf) logcontent - commandAction $ startAction seeker si f k - precachefinisher lreader - Nothing -> return () - - precacher config oreader lfeeder lcloser = liftIO oreader >>= \case - Just ((si, f), content) -> do - case parseLinkTargetOrPointerLazy =<< content of - Just k -> checkpresence k $ - let logf = locationLogFile config k - ref = Git.Ref.branchFileRef Annex.Branch.fullname logf - in liftIO $ lfeeder ((logf, (si, f), k), ref) - Nothing -> noop - precacher config oreader lfeeder lcloser - Nothing -> liftIO $ void lcloser - - feedmatches matcher ofeeder si f sha = - whenM (matcher $ MatchingFile $ FileInfo f f) $ - liftIO $ ofeeder ((si, f), sha) - - process matcher ofeeder mdfeeder mdcloser seenpointer ((si, (f, sha, mode)):rest) = + process mi ofeeder mdfeeder mdcloser seenpointer ((si, (f, sha, mode)):rest) = case Git.toTreeItemType mode of Just Git.TreeSymlink -> do whenM (exists f) $ -- Once a pointer file has been seen, -- symlinks have to be sent via the - -- metadata 
processor too. That is slightly - -- slower, but preserves the requested - -- file order. + -- metadata processor too. That is + -- slightly slower, but preserves the + -- requested file order. if seenpointer then liftIO $ mdfeeder ((si, f), sha) - else feedmatches matcher ofeeder si f sha - process matcher ofeeder mdfeeder mdcloser seenpointer rest + else feedmatches mi ofeeder si f sha + process mi ofeeder mdfeeder mdcloser seenpointer rest Just Git.TreeSubmodule -> - process matcher ofeeder mdfeeder mdcloser seenpointer rest + process mi ofeeder mdfeeder mdcloser seenpointer rest -- Might be a pointer file, might be other -- file in git, possibly large. Avoid catting -- large files by first looking up the size. Just _ -> do whenM (exists f) $ liftIO $ mdfeeder ((si, f), sha) - process matcher ofeeder mdfeeder mdcloser True rest + process mi ofeeder mdfeeder mdcloser True rest Nothing -> - process matcher ofeeder mdfeeder mdcloser seenpointer rest + process mi ofeeder mdfeeder mdcloser seenpointer rest process _ _ _ mdcloser _ [] = liftIO $ void mdcloser -- Check if files exist, because a deleted file will still be -- listed by ls-tree, but should not be processed. 
exists p = isJust <$> liftIO (catchMaybeIO $ R.getSymbolicLinkStatus p) - mdprocess matcher mdreader ofeeder ocloser = liftIO mdreader >>= \case + mdprocess mi mdreader ofeeder ocloser = liftIO mdreader >>= \case Just ((si, f), Just (sha, size, _type)) | size < maxPointerSz -> do - feedmatches matcher ofeeder si f sha - mdprocess matcher mdreader ofeeder ocloser - Just _ -> mdprocess matcher mdreader ofeeder ocloser + feedmatches mi ofeeder si f sha + mdprocess mi mdreader ofeeder ocloser + Just _ -> mdprocess mi mdreader ofeeder ocloser Nothing -> liftIO $ void ocloser seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> WorkTreeItems -> Annex [(SeekInput, a)] diff --git a/Limit.hs b/Limit.hs index 8335a273a6..cb2fafdc55 100644 --- a/Limit.hs +++ b/Limit.hs @@ -95,6 +95,8 @@ limitInclude glob = Right $ MatchFiles { matchAction = const $ matchGlobFile glob , matchNeedsFileName = True , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } {- Add a limit to skip files that match the glob. 
-} @@ -106,6 +108,8 @@ limitExclude glob = Right $ MatchFiles { matchAction = const $ not <$$> matchGlobFile glob , matchNeedsFileName = True , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } matchGlobFile :: String -> MatchInfo -> Annex Bool @@ -145,6 +149,8 @@ matchMagic _limitname querymagic selectprovidedinfo (Just magic) glob = { matchAction = const go , matchNeedsFileName = True , matchNeedsFileContent = True + , matchNeedsKey = False + , matchNeedsLocationLog = False } where cglob = compileGlob glob CaseSensative -- memoized @@ -162,6 +168,8 @@ addUnlocked = addLimit $ Right $ MatchFiles { matchAction = const $ matchLockStatus False , matchNeedsFileName = True , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } addLocked :: Annex () @@ -169,6 +177,8 @@ addLocked = addLimit $ Right $ MatchFiles { matchAction = const $ matchLockStatus True , matchNeedsFileName = True , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } matchLockStatus :: Bool -> MatchInfo -> Annex Bool @@ -188,14 +198,16 @@ addIn s = do u <- Remote.nameToUUID name hereu <- getUUID addLimit $ if u == hereu && null date - then use inhere - else use (inuuid u) + then use True inhere + else use False (inuuid u) where (name, date) = separate (== '@') s - use a = Right $ MatchFiles + use inhere a = Right $ MatchFiles { matchAction = checkKey . 
a , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = not inhere } inuuid u notpresent key | null date = do @@ -224,6 +236,8 @@ limitPresent u = MatchFiles return $ maybe False (`elem` us) u , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = not (isNothing u) } {- Limit to content that is in a directory, anywhere in the repository tree -} @@ -232,6 +246,8 @@ limitInDir dir = MatchFiles { matchAction = const go , matchNeedsFileName = True , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } where go (MatchingFile fi) = checkf $ fromRawFilePath $ matchFile fi @@ -262,6 +278,8 @@ limitCopies want = case splitc ':' want of go' n good notpresent , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = True } go' n good notpresent key = do us <- filter (`S.notMember` notpresent) @@ -284,6 +302,8 @@ limitLackingCopies approx want = case readish want of go mi needed notpresent , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = True } Nothing -> Left "bad value for number of lacking copies" where @@ -310,6 +330,8 @@ limitUnused = MatchFiles { matchAction = go , matchNeedsFileName = True , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = False } where go _ (MatchingFile _) = return False @@ -324,6 +346,8 @@ limitAnything = MatchFiles { matchAction = \_ _ -> return True , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } {- Limit that never matches. 
-} @@ -332,6 +356,8 @@ limitNothing = MatchFiles { matchAction = \_ _ -> return False , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } {- Adds a limit to skip files not believed to be present in all @@ -352,6 +378,8 @@ limitInAllGroup getgroupmap groupname = Right $ MatchFiles else checkKey (check want) mi , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = True } where check want key = do @@ -367,6 +395,8 @@ limitInBackend name = Right $ MatchFiles { matchAction = const $ checkKey check , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = False } where check key = pure $ fromKey keyVariety key == variety @@ -381,6 +411,8 @@ limitSecureHash = MatchFiles { matchAction = const $ checkKey isCryptographicallySecure , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = False } {- Adds a limit to skip files that are too large or too small -} @@ -399,6 +431,8 @@ limitSize lb vs s = case readSize dataUnits s of LimitAnnexFiles -> False LimitDiskFiles -> True , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } where go sz _ (MatchingFile fi) = case lb of @@ -425,6 +459,8 @@ limitMetaData s = case parseMetaDataMatcher s of { matchAction = const $ checkKey (check f matching) , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = False } where check f matching k = not . 
S.null @@ -446,6 +482,8 @@ addTimeLimit duration = do else return True , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } addAccessedWithin :: Duration -> Annex () @@ -455,6 +493,8 @@ addAccessedWithin duration = do { matchAction = const $ checkKey $ check now , matchNeedsFileName = False , matchNeedsFileContent = False + , matchNeedsKey = False + , matchNeedsLocationLog = False } where check now k = inAnnexCheck k $ \f -> diff --git a/Limit/Wanted.hs b/Limit/Wanted.hs index 552f0c2e5d..2a38a25562 100644 --- a/Limit/Wanted.hs +++ b/Limit/Wanted.hs @@ -1,6 +1,6 @@ {- git-annex limits by wanted status - - - Copyright 2012 Joey Hess + - Copyright 2012-2020 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -11,20 +11,29 @@ import Annex.Common import Annex.Wanted import Limit import Types.FileMatcher +import Logs.PreferredContent addWantGet :: Annex () -addWantGet = addLimit $ Right $ MatchFiles - { matchAction = const $ checkWant $ wantGet False Nothing - , matchNeedsFileName = False - , matchNeedsFileContent = False - } +addWantGet = addPreferredContentLimit $ + checkWant $ wantGet False Nothing addWantDrop :: Annex () -addWantDrop = addLimit $ Right $ MatchFiles - { matchAction = const $ checkWant $ wantDrop False Nothing Nothing - , matchNeedsFileName = False - , matchNeedsFileContent = False - } +addWantDrop = addPreferredContentLimit $ + checkWant $ wantDrop False Nothing Nothing + +addPreferredContentLimit :: (MatchInfo -> Annex Bool) -> Annex () +addPreferredContentLimit a = do + nfn <- introspectPreferredRequiredContent matchNeedsFileName Nothing + nfc <- introspectPreferredRequiredContent matchNeedsFileContent Nothing + nk <- introspectPreferredRequiredContent matchNeedsKey Nothing + nl <- introspectPreferredRequiredContent matchNeedsLocationLog Nothing + addLimit $ Right $ MatchFiles + { matchAction = const a + , matchNeedsFileName = nfn + , matchNeedsFileContent = nfc + , matchNeedsKey = nk + , matchNeedsLocationLog = nl + } checkWant :: (AssociatedFile -> Annex Bool) -> 
MatchInfo -> Annex Bool checkWant a (MatchingFile fi) = a (AssociatedFile (Just $ matchFile fi)) diff --git a/Types/FileMatcher.hs b/Types/FileMatcher.hs index 8a56a67ac7..5903d36f7e 100644 --- a/Types/FileMatcher.hs +++ b/Types/FileMatcher.hs @@ -62,6 +62,10 @@ data MatchFiles a = MatchFiles , matchNeedsFileContent :: Bool -- ^ does the matchAction need the file content to be present in -- order to succeed? + , matchNeedsKey :: Bool + -- ^ does the matchAction look at information about the key? + , matchNeedsLocationLog :: Bool + -- ^ does the matchAction look at the location log? } type FileMatcher a = Matcher (MatchFiles a) diff --git a/doc/todo/faster_key_lookup_for_limits.mdwn b/doc/todo/faster_key_lookup_for_limits.mdwn index fd92c624e3..f6f9e46778 100644 --- a/doc/todo/faster_key_lookup_for_limits.mdwn +++ b/doc/todo/faster_key_lookup_for_limits.mdwn @@ -11,3 +11,5 @@ log for limits that need that), otherwise before getting the key. > So this needs a way to introspect a limit to see if the terms used in it > match some criteria. Another todo that also needs that is > [[sync_fast_import]] --[[Joey]] + +[[done]] --[[Joey]]