slower but sequential filtering of large files from pointer files

Seeking over pointer files should still see a speedup, just not as
large as the speedup from seeking over symlinks.
This commit is contained in:
Joey Hess 2020-07-10 15:18:42 -04:00
parent 0f6b1ee048
commit b4d0f6dfc2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
2 changed files with 15 additions and 24 deletions

View file

@ -29,6 +29,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
after getting several thousand files.
* Sped up the --all option by 2x to 16x by using git cat-file --buffer.
Thanks to Lukey for finding this optimisation.
* Sped up seeking for annexed files to operate on by a factor of nearly 2x.
* fsck: Detect if WORM keys contain a carriage return, and recommend
upgrading the key. (git-annex could have maybe created such keys back
in 2013).

View file

@ -30,7 +30,7 @@ import Logs.Transfer
import Remote.List
import qualified Remote
import Annex.CatFile
import Git.CatFile
import Git.CatFile (catObjectStreamLsTree, catObjectStream)
import Annex.CurrentBranch
import Annex.Content
import Annex.Link
@ -270,21 +270,16 @@ seekFilteredKeys :: (RawFilePath -> Key -> CommandSeek) -> Annex [(RawFilePath,
seekFilteredKeys a fs = do
g <- Annex.gitRepo
matcher <- Limit.getMatcher
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
catObjectStream g $ \feeder closer reader -> do
processertid <- liftIO . async =<< forkState
(gofeed matcher feeder closer mdfeeder mdcloser)
mdprocessertid <- liftIO . async =<< forkState
(mdprocess matcher mdreader feeder)
goread reader
join (liftIO (wait mdprocessertid))
join (liftIO (wait processertid))
catObjectStream g $ \feeder closer reader -> do
processertid <- liftIO . async =<< forkState
(gofeed matcher feeder closer)
goread reader
join (liftIO (wait processertid))
where
gofeed matcher feeder closer mdfeeder mdcloser = do
gofeed matcher feeder closer = do
l <- fs
forM_ l $ process matcher feeder mdfeeder
liftIO $ void closer
liftIO $ void mdcloser
forM_ l $ process matcher feeder
liftIO closer
goread reader = liftIO reader >>= \case
Just (f, content) -> do
@ -296,7 +291,7 @@ seekFilteredKeys a fs = do
whenM (matcher $ MatchingFile $ FileInfo f f) $
liftIO $ feeder (f, sha)
process matcher feeder mdfeeder (f, sha, mode) = case
process matcher feeder (f, sha, mode) = case
Git.toTreeItemType mode of
Just Git.TreeSymlink ->
feedmatches matcher feeder f sha
@ -304,17 +299,12 @@ seekFilteredKeys a fs = do
-- Might be a pointer file, or might be some other
-- file in git, possibly a large one. Avoid catting
-- large files by first looking up the size.
Just _ -> liftIO $ mdfeeder (f, sha)
Just _ -> catObjectMetaData sha >>= \case
Just (_, sz, _) | sz <= maxPointerSz ->
feedmatches matcher feeder f sha
_ -> return ()
Nothing -> return ()
mdprocess matcher mdreader feeder = liftIO mdreader >>= \case
Just (f, Just (sha, size, _type))
| size < maxPointerSz -> do
feedmatches matcher feeder f sha
mdprocess matcher mdreader feeder
Just _ -> mdprocess matcher mdreader feeder
Nothing -> return ()
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> [WorkTreeItem] -> Annex [a]
seekHelper c ww a l = do
os <- seekOptions ww