speed up initial scanning for annexed files

Streaming through git this way speeds it up by around 25%. This is
similar to the optimisations made to seeking annexed files.

Sponsored-by: Dartmouth College's Datalad project
Joey Hess 2021-05-31 13:40:42 -04:00
parent aa00e171cb
commit 0f54e5e0ae
GPG key ID: DB12DB0FF05F8F38
2 changed files with 46 additions and 18 deletions
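
To put the commit message's "streaming through git" in concrete terms: rather than running a separate object lookup for every file in the tree, the scan now pushes all the candidate entries through one long-running `git cat-file --batch` process and reads the answers back as a stream. The sketch below shows only that general plumbing; `streamObjects` and its lockstep request/response loop are invented for illustration, while the actual change (in the diff below) goes through `catObjectStreamLsTree` from Git.CatFile, which pipelines requests and responses.

    -- Illustrative sketch, not git-annex code: one long-running
    -- `git cat-file --batch` process answers every object query, instead
    -- of paying a per-file lookup cost. The real implementation pipelines
    -- requests and responses concurrently; this simplified version works
    -- in lockstep.
    import System.IO
    import System.Process
    import qualified Data.ByteString.Lazy as L

    streamObjects :: [String] -> (String -> L.ByteString -> IO ()) -> IO ()
    streamObjects objs consume = do
        (Just hin, Just hout, _, p) <- createProcess
            (proc "git" ["cat-file", "--batch"])
                { std_in = CreatePipe, std_out = CreatePipe }
        hSetBuffering hin LineBuffering
        hSetBinaryMode hout True
        mapM_ (\o -> hPutStrLn hin o >> readone hout o) objs
        hClose hin
        _ <- waitForProcess p
        return ()
      where
        -- Each response is "<sha> <type> <size>\n", followed by <size>
        -- bytes of object content and a trailing newline.
        readone hout o = do
            header <- hGetLine hout
            case words header of
                [_, _, sz] -> do
                    content <- L.hGet hout (read sz)
                    _ <- hGetLine hout
                    consume o content
                _ -> return () -- e.g. "<object> missing"

    -- Example usage (the object name is arbitrary): print the size of the
    -- HEAD commit object.
    main :: IO ()
    main = streamObjects ["HEAD"] $ \o content ->
        putStrLn (o ++ ": " ++ show (L.length content) ++ " bytes")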

Annex/WorkTree.hs

@@ -1,6 +1,6 @@
 {- git-annex worktree files
  -
- - Copyright 2013-2020 Joey Hess <id@joeyh.name>
+ - Copyright 2013-2021 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -8,6 +8,7 @@
 module Annex.WorkTree where
 
 import Annex.Common
+import qualified Annex
 import Annex.Link
 import Annex.CatFile
 import Annex.Content
@@ -16,6 +17,7 @@ import Annex.CurrentBranch
 import Annex.InodeSentinal
 import Utility.InodeCache
 import Git.FilePath
+import Git.CatFile
 import qualified Git.Ref
 import qualified Git.LsTree
 import qualified Git.Types
@@ -24,7 +26,7 @@ import qualified Database.Keys.SQL
 import Config
 import qualified Utility.RawFilePath as R
-import Control.Concurrent
+import qualified Data.ByteString.Lazy as L
 
 {- Looks up the key corresponding to an annexed file in the work tree,
  - by examining what the file links to.
@@ -79,30 +81,40 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
 -}
 scanAnnexedFiles :: Annex ()
 scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
-	dropold <- liftIO $ newMVar $
-		Database.Keys.runWriter $
-			liftIO . Database.Keys.SQL.dropAllAssociatedFiles
+	g <- Annex.gitRepo
+	Database.Keys.runWriter $
+		liftIO . Database.Keys.SQL.dropAllAssociatedFiles
 	(l, cleanup) <- inRepo $ Git.LsTree.lsTree
 		Git.LsTree.LsTreeRecursive
-		(Git.LsTree.LsTreeLong False)
+		(Git.LsTree.LsTreeLong True)
 		Git.Ref.headRef
-	forM_ l $ \i ->
-		maybe noop (add dropold i)
-			=<< catKey'
-				(Git.LsTree.sha i)
-				(fromMaybe 0 (Git.LsTree.size i))
+	catObjectStreamLsTree l want g go
 	liftIO $ void cleanup
   where
-	isregfile i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
-		Just Git.Types.TreeFile -> True
-		Just Git.Types.TreeExecutable -> True
-		_ -> False
-	add dropold i k = do
-		join $ fromMaybe noop <$> liftIO (tryTakeMVar dropold)
+	-- Want to process symlinks, and regular files.
+	want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
+		Just Git.Types.TreeSymlink -> Just (i, False)
+		Just Git.Types.TreeFile -> checkfilesize i
+		Just Git.Types.TreeExecutable -> checkfilesize i
+		_ -> Nothing
+	-- Avoid processing files that are too large to be pointer files.
+	checkfilesize i = case Git.LsTree.size i of
+		Just n | n < maxPointerSz -> Just (i, True)
+		_ -> Nothing
+	go getnext = liftIO getnext >>= \case
+		Just ((i, isregfile), Just c) -> do
+			maybe noop (add i isregfile)
+				(parseLinkTargetOrPointer (L.toStrict c))
+			go getnext
+		_ -> return ()
+	add i isregfile k = do
 		let tf = Git.LsTree.file i
 		Database.Keys.runWriter $
 			liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
-		whenM (pure (isregfile i) <&&> inAnnex k) $ do
+		whenM (pure isregfile <&&> inAnnex k) $ do
 			f <- fromRepo $ fromTopFilePath tf
 			liftIO (isPointerFile f) >>= \case
 				Just k' | k' == k -> do
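
One thing the diff leaves implicit is the control flow of the new interface: instead of returning a list of results, the streaming call hands `go` an action (`getnext` above) that yields the next (tree entry, object content) pair until the stream is exhausted. The snippet below is a self-contained illustration of that consumer shape, with invented names and a simulated stream; the commented-out signature is an assumption inferred from the call site, not the verbatim Git.CatFile export.

    {-# LANGUAGE LambdaCase #-}

    import Data.IORef

    -- Assumed shape of the streaming call used above (inferred from the
    -- call site, not quoted from git-annex):
    --
    --   catObjectStreamLsTree
    --       :: [Git.LsTree.TreeItem]            -- entries from ls-tree
    --       -> (Git.LsTree.TreeItem -> Maybe a) -- which entries to fetch
    --       -> Git.Repo
    --       -> (IO (Maybe (a, Maybe L.ByteString)) -> Annex r) -- consumer
    --       -> Annex r
    --
    -- The consumer is handed an action that yields Just the next result
    -- until the stream is drained. `drain` mirrors that loop.
    drain :: IO (Maybe (FilePath, Bool)) -> IO ()
    drain getnext = getnext >>= \case
        Just (path, isregfile) -> do
            putStrLn (path ++ (if isregfile then " (regular file)" else " (symlink)"))
            drain getnext
        Nothing -> return ()

    -- Simulate such a stream with an IORef holding the remaining items.
    main :: IO ()
    main = do
        r <- newIORef [("foo.txt", True), ("bar", False)]
        let getnext = atomicModifyIORef r $ \case
                [] -> ([], Nothing)
                (x:xs) -> (xs, Just x)
        drain getnext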


@@ -0,0 +1,16 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 12"""
+ date="2021-05-31T16:30:11Z"
+ content="""
+Implemented streaming through git. In a repo with 100000 unlocked files,
+version 8.20210429 took 46 seconds, now reduced to 36 seconds.
+
+When the files are locked, of course the old version was faster
+due to being able to skip all symlinks, 2 seconds. The new version takes
+slightly less time than it does for unlocked files, 35 seconds.
+
+Now the git query and processing is only a few seconds of the total run time,
+writing information about all the files to sqlite is most of the rest,
+and may also be possible to speed up.
+"""]]