From d732ef1a89bfbd7c2595c2ad06326434a6d78e52 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 24 Jul 2020 12:56:02 -0400 Subject: [PATCH] move, copy: Sped up seeking for annexed files to operate on by a factor of nearly 2x. --- CHANGELOG | 2 ++ Command/Copy.hs | 7 ++-- Command/Move.hs | 32 ++++++++----------- doc/todo/faster_key_lookup_for_limits.mdwn | 9 ++++++ ...logs_for_speed_with_cat-file_--buffer.mdwn | 17 ++++------ 5 files changed, 37 insertions(+), 30 deletions(-) create mode 100644 doc/todo/faster_key_lookup_for_limits.mdwn diff --git a/CHANGELOG b/CHANGELOG index a51cf9997d..6dc53d6b57 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,8 @@ git-annex (8.20200720.2) UNRELEASED; urgency=medium some weird inheriting of ssh FDs by sshd. Bug was introduced in git-annex version 7.20200202.7. * Fix a bug in find --branch in the previous version. + * move, copy: Sped up seeking for annexed files to operate on by a factor + of nearly 2x. -- Joey Hess Tue, 21 Jul 2020 12:58:30 -0400 diff --git a/Command/Copy.hs b/Command/Copy.hs index 75ede060e0..5a8b185c48 100644 --- a/Command/Copy.hs +++ b/Command/Copy.hs @@ -57,8 +57,11 @@ seek o = startConcurrency commandStages $ do seeker = AnnexedFileSeeker { startAction = start o - , checkContentPresent = Nothing - , usesLocationLog = False + , checkContentPresent = case fromToOptions o of + Right (FromRemote _) -> Just False + Right (ToRemote _) -> Just True + Left ToHere -> Just False + , usesLocationLog = True } {- A copy is just a move that does not delete the source file. diff --git a/Command/Move.hs b/Command/Move.hs index bbb4a0dc2d..ba34441330 100644 --- a/Command/Move.hs +++ b/Command/Move.hs @@ -55,11 +55,6 @@ data RemoveWhen = RemoveSafe | RemoveNever seek :: MoveOptions -> CommandSeek seek o = startConcurrency stages $ do - let seeker = AnnexedFileSeeker - { startAction = start (fromToOptions o) (removeWhen o) - , checkContentPresent = Nothing - , usesLocationLog = False - } case batchOption o of NoBatch -> withKeyOptions (keyOptions o) False seeker (commandAction . startKey (fromToOptions o) (removeWhen o)) @@ -67,6 +62,14 @@ seek o = startConcurrency stages $ do =<< workTreeItems ww (moveFiles o) Batch fmt -> batchAnnexedFilesMatching fmt seeker where + seeker = AnnexedFileSeeker + { startAction = start (fromToOptions o) (removeWhen o) + , checkContentPresent = case fromToOptions o of + Right (FromRemote _) -> Nothing + Right (ToRemote _) -> Just True + Left ToHere -> Nothing + , usesLocationLog = True + } stages = case fromToOptions o of Right (FromRemote _) -> downloadStages Right (ToRemote _) -> commandStages @@ -103,9 +106,8 @@ describeMoveAction _ = "move" toStart :: RemoveWhen -> AssociatedFile -> Key -> ActionItem -> Remote -> CommandStart toStart removewhen afile key ai dest = do u <- getUUID - ishere <- inAnnex key - if not ishere || u == Remote.uuid dest - then stop -- not here, so nothing to do + if u == Remote.uuid dest + then stop else toStart' dest removewhen afile key ai toStart' :: Remote -> RemoveWhen -> AssociatedFile -> Key -> ActionItem -> CommandStart @@ -188,11 +190,8 @@ toPerform dest removewhen key afile fastcheck isthere = return False fromStart :: RemoveWhen -> AssociatedFile -> Key -> ActionItem -> Remote -> CommandStart -fromStart removewhen afile key ai src = case removewhen of - RemoveNever -> stopUnless (not <$> inAnnex key) go - RemoveSafe -> go - where - go = stopUnless (fromOk src key) $ +fromStart removewhen afile key ai src = + stopUnless (fromOk src key) $ starting (describeMoveAction removewhen) (OnlyActionOn key ai) $ fromPerform src removewhen key afile @@ -247,11 +246,8 @@ fromPerform src removewhen key afile = do - When moving, the content is removed from all the reachable remotes that - it can safely be removed from. -} toHereStart :: RemoveWhen -> AssociatedFile -> Key -> ActionItem -> CommandStart -toHereStart removewhen afile key ai = case removewhen of - RemoveNever -> stopUnless (not <$> inAnnex key) go - RemoveSafe -> go - where - go = startingNoMessage (OnlyActionOn key ai) $ do +toHereStart removewhen afile key ai = + startingNoMessage (OnlyActionOn key ai) $ do rs <- Remote.keyPossibilities key forM_ rs $ \r -> includeCommandAction $ diff --git a/doc/todo/faster_key_lookup_for_limits.mdwn b/doc/todo/faster_key_lookup_for_limits.mdwn new file mode 100644 index 0000000000..b9cac11644 --- /dev/null +++ b/doc/todo/faster_key_lookup_for_limits.mdwn @@ -0,0 +1,9 @@ +As part of the work in [[precache_logs_for_speed_with_cat-file_--buffer]], +key lookups are now done twice as fast as before. + +But, limits that look up keys still do a key lookup, before the key +is looked up efficiently. Avoiding that would speed up --in etc, probably +another 1.5x-2x speedup when such limits are used. What that optimisation +needs is a way to tell if the current limit needs the key or not. If it +does, then match on it after getting the key (and precaching the location +log for limits that need that), otherwise before getting the key. diff --git a/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn b/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn index 127e7e7174..da6962b65b 100644 --- a/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn +++ b/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn @@ -33,10 +33,13 @@ and precache them. > > > * `sync --content` 2x speedup! > > > * `fsck --fast` 1.5x speedup > > > * `whereis` 1.5x speedup +> > > * `copy --to --fast` twenty-five percent or so speedup +> > > * `copy --to` 2x speedup +> > > * `copy --from` 2x speedup > > > -> > > Still todo: -> > > -> > > * move, copy, drop, and mirror were left not using the location log caching yet +> > > For copy benchmarks, note that both repos had all files. +> > > +> > > [[done]] Another thing that the same cat-file --buffer approach could be used with is to cat the annex links. Git.LsFiles.inRepoDetails provides the Sha @@ -52,10 +55,4 @@ Some calls to lookupKey remain, and the above could be used to remove them and make it faster. The ones in Annex.View and Command.Unused seem most likely to be able to be converted. -Also, limits that look up keys still do a key lookup, before the key is -looked up efficiently. (Before these changes, the same key lookup was done -2x too..) Avoiding that would speed up --in etc, probably another 1.5x-2x -speedup when such limits are used. What that optimisation needs is a way to -tell if the current limit needs the key or not. If it does, then match on -it after getting the key (and precaching the location log for limits that -need that), otherwise before getting the key. +See also [[faster_key_lookup_for_limits]]