sync --all avoid unncessary first pass

Sped up seeking to around twice as fast, by avoiding a pass over the
worktree files when preferred content expressions of the local repo and
remotes don't use include=/exclude=.

Thanks to Lukey for identifying the optimisation.

This commit was sponsored by Brock Spratlen on Patreon.
This commit is contained in:
Joey Hess 2020-09-24 15:12:09 -04:00
parent b45b37b088
commit d89984b121
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
9 changed files with 86 additions and 12 deletions

View file

@ -267,6 +267,7 @@ call :: Either String (FileMatcher Annex) -> ParseResult (MatchFiles Annex)
call (Right sub) = Right $ Operation $ MatchFiles
{ matchAction = \notpresent mi ->
matchMrun sub $ \o -> matchAction o notpresent mi
, matchNeedsFileName = any matchNeedsFileName sub
, matchNeedsFileContent = any matchNeedsFileContent sub
}
call (Left err) = Left err

View file

@ -18,6 +18,9 @@ git-annex (8.20200909) UNRELEASED; urgency=medium
message, with some hints for the user for what to do.
* Improve --debug output to show pid of processes that are started and
stopped.
* sync --all: Sped up seeking to around twice as fast, by avoiding a
pass over the worktree files when preferred content expressions of the
local repo and remotes don't use include=/exclude=.
-- Joey Hess <id@joeyh.name> Mon, 14 Sep 2020 18:34:37 -0400

View file

@ -57,6 +57,7 @@ import Annex.Drop
import Annex.UUID
import Logs.UUID
import Logs.Export
import Logs.PreferredContent
import Annex.AutoMerge
import Annex.AdjustedBranch
import Annex.Ssh
@ -65,6 +66,7 @@ import Annex.UpdateInstead
import Annex.Export
import Annex.TaggedPush
import Annex.CurrentBranch
import Types.FileMatcher
import qualified Database.Export as Export
import Utility.Bloom
import Utility.OptParse
@ -633,9 +635,11 @@ newer remote b = do
- (Or, when in an ajusted branch where some files are hidden, at files in
- the original branch.)
-
- With --all, makes a second pass over all keys.
- This ensures that preferred content expressions that match on
- filenames work, even when in --all mode.
- With --all, when preferred content expressions look at filenames,
- makes a first pass over the files in the work tree so those preferred
- content expressions will match. The second pass is over all keys,
- and only preferred content expressions that don't look at filenames
- will match.
-
- Returns true if any file transfers were made.
-
@ -646,7 +650,12 @@ seekSyncContent _ [] _ = return False
seekSyncContent o rs currbranch = do
mvar <- liftIO newEmptyMVar
bloom <- case keyOptions o of
Just WantAllKeys -> Just <$> genBloomFilter (seekworktree mvar (WorkTreeItems []))
Just WantAllKeys -> ifM preferredcontentmatchesfilenames
( Just <$> genBloomFilter (seekworktree mvar (WorkTreeItems []))
, do
liftIO $ print "skipped first pass"
pure Nothing
)
_ -> case currbranch of
(Just origbranch, Just adj) | adjustmentHidesFiles adj -> do
l <- workTreeItems' (AllowHidden True) ww (contentOfOption o)
@ -692,6 +701,12 @@ seekSyncContent o rs currbranch = do
void $ liftIO $ tryPutMVar mvar ()
next $ return True
preferredcontentmatchesfilenames =
preferredcontentmatchesfilenames' Nothing
<||> anyM (preferredcontentmatchesfilenames' . Just . Remote.uuid) rs
preferredcontentmatchesfilenames' =
introspectPreferredRequiredContent matchNeedsFileName
{- If it's preferred content, and we don't have it, get it from one of the
- listed remotes (preferring the cheaper earlier ones).
-
@ -717,11 +732,13 @@ syncFile ebloom rs af k = do
u <- getUUID
let locs' = concat [if inhere || got then [u] else [], putrs, locs]
-- A bloom filter is populated with all the keys in the first pass.
-- On the second pass, avoid dropping keys that were seen in the
-- first pass, which would happen otherwise when preferred content
-- matches on the filename, which is not available in the second
-- pass.
-- To handle --all, a bloom filter is populated with all the keys
-- of files in the working tree in the first pass. On the second
-- pass, avoid dropping keys that were seen in the first pass, which
-- would happen otherwise when preferred content matches on the
-- filename, which is not available in the second pass.
-- (When the preferred content expressions do not match on
-- filenames, the first pass is skipped for speed.)
--
-- When there's a false positive in the bloom filter, the result
-- is keeping a key that preferred content doesn't really want.

View file

@ -1,6 +1,6 @@
{- user-specified limits on files to act on
-
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -88,6 +88,7 @@ addInclude = addLimit . limitInclude
limitInclude :: MkLimit Annex
limitInclude glob = Right $ MatchFiles
{ matchAction = const $ matchGlobFile glob
, matchNeedsFileName = True
, matchNeedsFileContent = False
}
@ -98,6 +99,7 @@ addExclude = addLimit . limitExclude
limitExclude :: MkLimit Annex
limitExclude glob = Right $ MatchFiles
{ matchAction = const $ not <$$> matchGlobFile glob
, matchNeedsFileName = True
, matchNeedsFileContent = False
}
@ -136,6 +138,7 @@ matchMagic :: String -> (Magic -> FilePath -> Annex (Maybe String)) -> (Provided
matchMagic _limitname querymagic selectprovidedinfo (Just magic) glob =
Right $ MatchFiles
{ matchAction = const go
, matchNeedsFileName = False
, matchNeedsFileContent = True
}
where
@ -152,12 +155,14 @@ matchMagic limitname _ _ Nothing _ =
addUnlocked :: Annex ()
addUnlocked = addLimit $ Right $ MatchFiles
{ matchAction = const $ matchLockStatus False
, matchNeedsFileName = True
, matchNeedsFileContent = False
}
addLocked :: Annex ()
addLocked = addLimit $ Right $ MatchFiles
{ matchAction = const $ matchLockStatus True
, matchNeedsFileName = True
, matchNeedsFileContent = False
}
@ -184,6 +189,7 @@ addIn s = do
(name, date) = separate (== '@') s
use a = Right $ MatchFiles
{ matchAction = checkKey . a
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
inuuid u notpresent key
@ -211,6 +217,7 @@ limitPresent u = MatchFiles
else do
us <- Remote.keyLocations key
return $ maybe False (`elem` us) u
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
@ -218,6 +225,7 @@ limitPresent u = MatchFiles
limitInDir :: FilePath -> MatchFiles Annex
limitInDir dir = MatchFiles
{ matchAction = const go
, matchNeedsFileName = True
, matchNeedsFileContent = False
}
where
@ -247,6 +255,7 @@ limitCopies want = case splitc ':' want of
Just n -> Right $ MatchFiles
{ matchAction = \notpresent -> checkKey $
go' n good notpresent
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
go' n good notpresent key = do
@ -268,6 +277,7 @@ limitLackingCopies approx want = case readish want of
Just needed -> Right $ MatchFiles
{ matchAction = \notpresent mi -> flip checkKey mi $
go mi needed notpresent
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
Nothing -> Left "bad value for number of lacking copies"
@ -293,6 +303,7 @@ limitLackingCopies approx want = case readish want of
limitUnused :: MatchFiles Annex
limitUnused = MatchFiles
{ matchAction = go
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
where
@ -306,6 +317,7 @@ limitUnused = MatchFiles
limitAnything :: MatchFiles Annex
limitAnything = MatchFiles
{ matchAction = \_ _ -> return True
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
@ -313,6 +325,7 @@ limitAnything = MatchFiles
limitNothing :: MatchFiles Annex
limitNothing = MatchFiles
{ matchAction = \_ _ -> return False
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
@ -332,6 +345,7 @@ limitInAllGroup getgroupmap groupname = Right $ MatchFiles
else if not (S.null (S.intersection want notpresent))
then return False
else checkKey (check want) mi
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
where
@ -346,6 +360,7 @@ addInBackend = addLimit . limitInBackend
limitInBackend :: MkLimit Annex
limitInBackend name = Right $ MatchFiles
{ matchAction = const $ checkKey check
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
where
@ -359,6 +374,7 @@ addSecureHash = addLimit $ Right limitSecureHash
limitSecureHash :: MatchFiles Annex
limitSecureHash = MatchFiles
{ matchAction = const $ checkKey isCryptographicallySecure
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
@ -374,6 +390,7 @@ limitSize lb vs s = case readSize dataUnits s of
Nothing -> Left "bad size"
Just sz -> Right $ MatchFiles
{ matchAction = go sz
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
where
@ -399,6 +416,7 @@ limitMetaData s = case parseMetaDataMatcher s of
Left e -> Left e
Right (f, matching) -> Right $ MatchFiles
{ matchAction = const $ checkKey (check f matching)
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
where
@ -419,6 +437,7 @@ addTimeLimit duration = do
shutdown True
liftIO $ exitWith $ ExitFailure 101
else return True
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
@ -427,6 +446,7 @@ addAccessedWithin duration = do
now <- liftIO getPOSIXTime
addLimit $ Right $ MatchFiles
{ matchAction = const $ checkKey $ check now
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
where

View file

@ -15,12 +15,14 @@ import Types.FileMatcher
addWantGet :: Annex ()
addWantGet = addLimit $ Right $ MatchFiles
{ matchAction = const $ checkWant $ wantGet False Nothing
, matchNeedsFileName = False
, matchNeedsFileContent = False
}
addWantDrop :: Annex ()
addWantDrop = addLimit $ Right $ MatchFiles
{ matchAction = const $ checkWant $ wantDrop False Nothing Nothing
, matchNeedsFileName = False
, matchNeedsFileContent = False
}

View file

@ -1,6 +1,6 @@
{- git-annex preferred content matcher configuration
-
- Copyright 2012-2019 Joey Hess <id@joeyh.name>
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -21,6 +21,7 @@ module Logs.PreferredContent (
defaultStandardGroup,
preferredRequiredMapsLoad,
preferredRequiredMapsLoad',
introspectPreferredRequiredContent,
prop_standardGroups_parse,
) where
@ -61,6 +62,16 @@ checkMap getmap mu notpresent mkey afile d = do
Nothing -> return d
Just matcher -> checkMatcher matcher mkey afile notpresent (return d) (return d)
{- Checks if the preferred or required content for the specified repository
- (or the current repository if none is specified) contains any terms
- that meet the condition. -}
introspectPreferredRequiredContent :: (MatchFiles Annex -> Bool) -> Maybe UUID -> Annex Bool
introspectPreferredRequiredContent c mu = do
u <- maybe getUUID return mu
check u preferredContentMap <||> check u requiredContentMap
where
check u mk = mk >>= return . maybe False (any c) . M.lookup u
preferredContentMap :: Annex (FileMatcherMap Annex)
preferredContentMap = maybe (fst <$> preferredRequiredMapsLoad preferredContentTokens) return
=<< Annex.getState Annex.preferredcontentmap

View file

@ -57,8 +57,11 @@ type AssumeNotPresent = S.Set UUID
data MatchFiles a = MatchFiles
{ matchAction :: AssumeNotPresent -> MatchInfo -> a Bool
, matchNeedsFileName :: Bool
-- ^ does the matchAction need a filename in order to match?
, matchNeedsFileContent :: Bool
-- ^ does the matchAction need the file content to be present?
-- ^ does the matchAction need the file content to be present in
-- order to succeed?
}
type FileMatcher a = Matcher (MatchFiles a)

View file

@ -19,3 +19,5 @@ and it led to a 2x speedup (with warm cache):
This repo has 25641 keys and all of them are in the worktree too.
> [[done]]! --[[Joey]]

View file

@ -0,0 +1,15 @@
[[!comment format=mdwn
username="joey"
subject="""comment 2"""
date="2020-09-24T19:04:32Z"
content="""
One side effect of this optimisation is that, while sync --all used to
tell the filenames it was getting or dropping, when operating on files
in the working tree, when the optimsation is enabled it will only
display the keys. So, its behavior in 2 different repos might seem
inconsistent to a user, who doesn't know about all these gory 2 pass details.
I think, if that became a problem, the best fix would be to only display
the keys, and never the worktree filenames, even when running the first
pass. But I'll wait and see if that needs to be done, I suppose.
"""]]