slower but sequential filtering of large files from pointer files

There should still be a speedup when seeking over pointer files, just not
as large as the speedup when seeking over symlinks.
This commit is contained in:
Joey Hess 2020-07-10 15:18:42 -04:00
parent 0f6b1ee048
commit b4d0f6dfc2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
2 changed files with 15 additions and 24 deletions

View file

@ -29,6 +29,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
after getting several thousand files. after getting several thousand files.
* Sped up the --all option by 2x to 16x by using git cat-file --buffer. * Sped up the --all option by 2x to 16x by using git cat-file --buffer.
Thanks to Lukey for finding this optimisation. Thanks to Lukey for finding this optimisation.
* Sped up seeking for annexed files to operate on by a factor of nearly 2x.
* fsck: Detect if WORM keys contain a carriage return, and recommend * fsck: Detect if WORM keys contain a carriage return, and recommend
upgrading the key. (git-annex could have maybe created such keys back upgrading the key. (git-annex could have maybe created such keys back
in 2013). in 2013).

View file

@ -30,7 +30,7 @@ import Logs.Transfer
import Remote.List import Remote.List
import qualified Remote import qualified Remote
import Annex.CatFile import Annex.CatFile
import Git.CatFile import Git.CatFile (catObjectStreamLsTree, catObjectStream)
import Annex.CurrentBranch import Annex.CurrentBranch
import Annex.Content import Annex.Content
import Annex.Link import Annex.Link
@ -270,21 +270,16 @@ seekFilteredKeys :: (RawFilePath -> Key -> CommandSeek) -> Annex [(RawFilePath,
seekFilteredKeys a fs = do seekFilteredKeys a fs = do
g <- Annex.gitRepo g <- Annex.gitRepo
matcher <- Limit.getMatcher matcher <- Limit.getMatcher
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader -> catObjectStream g $ \feeder closer reader -> do
catObjectStream g $ \feeder closer reader -> do processertid <- liftIO . async =<< forkState
processertid <- liftIO . async =<< forkState (gofeed matcher feeder closer)
(gofeed matcher feeder closer mdfeeder mdcloser) goread reader
mdprocessertid <- liftIO . async =<< forkState join (liftIO (wait processertid))
(mdprocess matcher mdreader feeder)
goread reader
join (liftIO (wait mdprocessertid))
join (liftIO (wait processertid))
where where
gofeed matcher feeder closer mdfeeder mdcloser = do gofeed matcher feeder closer = do
l <- fs l <- fs
forM_ l $ process matcher feeder mdfeeder forM_ l $ process matcher feeder
liftIO $ void closer liftIO closer
liftIO $ void mdcloser
goread reader = liftIO reader >>= \case goread reader = liftIO reader >>= \case
Just (f, content) -> do Just (f, content) -> do
@ -296,7 +291,7 @@ seekFilteredKeys a fs = do
whenM (matcher $ MatchingFile $ FileInfo f f) $ whenM (matcher $ MatchingFile $ FileInfo f f) $
liftIO $ feeder (f, sha) liftIO $ feeder (f, sha)
process matcher feeder mdfeeder (f, sha, mode) = case process matcher feeder (f, sha, mode) = case
Git.toTreeItemType mode of Git.toTreeItemType mode of
Just Git.TreeSymlink -> Just Git.TreeSymlink ->
feedmatches matcher feeder f sha feedmatches matcher feeder f sha
@ -304,17 +299,12 @@ seekFilteredKeys a fs = do
-- Might be a pointer file, might be other -- Might be a pointer file, might be other
-- file in git, possibly large. Avoid catting -- file in git, possibly large. Avoid catting
-- large files by first looking up the size. -- large files by first looking up the size.
Just _ -> liftIO $ mdfeeder (f, sha) Just _ -> catObjectMetaData sha >>= \case
Just (_, sz, _) | sz <= maxPointerSz ->
feedmatches matcher feeder f sha
_ -> return ()
Nothing -> return () Nothing -> return ()
mdprocess matcher mdreader feeder = liftIO mdreader >>= \case
Just (f, Just (sha, size, _type))
| size < maxPointerSz -> do
feedmatches matcher feeder f sha
mdprocess matcher mdreader feeder
Just _ -> mdprocess matcher mdreader feeder
Nothing -> return ()
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> [WorkTreeItem] -> Annex [a] seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> [WorkTreeItem] -> Annex [a]
seekHelper c ww a l = do seekHelper c ww a l = do
os <- seekOptions ww os <- seekOptions ww