speed up initial scanning for annexed files

Streaming through git this way speeds it up by around 25%. This is
similar to the optimisations made to seeking annexed files.

Sponsored-by: Dartmouth College's Datalad project
Joey Hess 2021-05-31 13:40:42 -04:00
parent aa00e171cb
commit 0f54e5e0ae
GPG key ID: DB12DB0FF05F8F38
2 changed files with 46 additions and 18 deletions
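
To put the commit message's "streaming through git" in concrete terms: rather than running a separate object lookup for every file in the tree, the scan now pushes all the candidate entries through one long-running `git cat-file --batch` process and reads the answers back as a stream. The sketch below shows only that general plumbing; `streamObjects` and its lockstep request/response loop are invented for illustration, while the actual change (in the diff below) goes through `catObjectStreamLsTree` from Git.CatFile, which pipelines requests and responses.

    -- Illustrative sketch, not git-annex code: one long-running
    -- `git cat-file --batch` process answers every object query, instead
    -- of paying a per-file lookup cost. The real implementation pipelines
    -- requests and responses concurrently; this simplified version works
    -- in lockstep.
    import System.IO
    import System.Process
    import qualified Data.ByteString.Lazy as L

    streamObjects :: [String] -> (String -> L.ByteString -> IO ()) -> IO ()
    streamObjects objs consume = do
        (Just hin, Just hout, _, p) <- createProcess
            (proc "git" ["cat-file", "--batch"])
                { std_in = CreatePipe, std_out = CreatePipe }
        hSetBuffering hin LineBuffering
        hSetBinaryMode hout True
        mapM_ (\o -> hPutStrLn hin o >> readone hout o) objs
        hClose hin
        _ <- waitForProcess p
        return ()
      where
        -- Each response is "<sha> <type> <size>\n", followed by <size>
        -- bytes of object content and a trailing newline.
        readone hout o = do
            header <- hGetLine hout
            case words header of
                [_, _, sz] -> do
                    content <- L.hGet hout (read sz)
                    _ <- hGetLine hout
                    consume o content
                _ -> return () -- e.g. "<object> missing"

    -- Example usage (the object name is arbitrary): print the size of the
    -- HEAD commit object.
    main :: IO ()
    main = streamObjects ["HEAD"] $ \o content ->
        putStrLn (o ++ ": " ++ show (L.length content) ++ " bytes")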

Annex/WorkTree.hs

@@ -1,6 +1,6 @@
 {- git-annex worktree files
  -
- - Copyright 2013-2020 Joey Hess <id@joeyh.name>
+ - Copyright 2013-2021 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -8,6 +8,7 @@
 module Annex.WorkTree where
 
 import Annex.Common
+import qualified Annex
 import Annex.Link
 import Annex.CatFile
 import Annex.Content
@@ -16,6 +17,7 @@ import Annex.CurrentBranch
 import Annex.InodeSentinal
 import Utility.InodeCache
 import Git.FilePath
+import Git.CatFile
 import qualified Git.Ref
 import qualified Git.LsTree
 import qualified Git.Types
@@ -24,7 +26,7 @@ import qualified Database.Keys.SQL
 import Config
 import qualified Utility.RawFilePath as R
-import Control.Concurrent
+import qualified Data.ByteString.Lazy as L
 
 {- Looks up the key corresponding to an annexed file in the work tree,
  - by examining what the file links to.
@@ -79,30 +81,40 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
 -}
 scanAnnexedFiles :: Annex ()
 scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
-	dropold <- liftIO $ newMVar $
-		Database.Keys.runWriter $
-			liftIO . Database.Keys.SQL.dropAllAssociatedFiles
+	g <- Annex.gitRepo
+	Database.Keys.runWriter $
+		liftIO . Database.Keys.SQL.dropAllAssociatedFiles
 	(l, cleanup) <- inRepo $ Git.LsTree.lsTree
 		Git.LsTree.LsTreeRecursive
-		(Git.LsTree.LsTreeLong False)
+		(Git.LsTree.LsTreeLong True)
 		Git.Ref.headRef
-	forM_ l $ \i ->
-		maybe noop (add dropold i)
-			=<< catKey'
-				(Git.LsTree.sha i)
-				(fromMaybe 0 (Git.LsTree.size i))
+	catObjectStreamLsTree l want g go
 	liftIO $ void cleanup
   where
-	isregfile i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
-		Just Git.Types.TreeFile -> True
-		Just Git.Types.TreeExecutable -> True
-		_ -> False
-	add dropold i k = do
-		join $ fromMaybe noop <$> liftIO (tryTakeMVar dropold)
+	-- Want to process symlinks, and regular files.
+	want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
+		Just Git.Types.TreeSymlink -> Just (i, False)
+		Just Git.Types.TreeFile -> checkfilesize i
+		Just Git.Types.TreeExecutable -> checkfilesize i
+		_ -> Nothing
+	-- Avoid processing files that are too large to be pointer files.
+	checkfilesize i = case Git.LsTree.size i of
+		Just n | n < maxPointerSz -> Just (i, True)
+		_ -> Nothing
+	go getnext = liftIO getnext >>= \case
+		Just ((i, isregfile), Just c) -> do
+			maybe noop (add i isregfile)
+				(parseLinkTargetOrPointer (L.toStrict c))
+			go getnext
+		_ -> return ()
+	add i isregfile k = do
 		let tf = Git.LsTree.file i
 		Database.Keys.runWriter $
 			liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
-		whenM (pure (isregfile i) <&&> inAnnex k) $ do
+		whenM (pure isregfile <&&> inAnnex k) $ do
 			f <- fromRepo $ fromTopFilePath tf
 			liftIO (isPointerFile f) >>= \case
 				Just k' | k' == k -> do
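
One thing the diff leaves implicit is the control flow of the new interface: instead of returning a list of results, the streaming call hands `go` an action (`getnext` above) that yields the next (tree entry, object content) pair until the stream is exhausted. The snippet below is a self-contained illustration of that consumer shape, with invented names and a simulated stream; the commented-out signature is an assumption inferred from the call site, not the verbatim Git.CatFile export.

    {-# LANGUAGE LambdaCase #-}

    import Data.IORef

    -- Assumed shape of the streaming call used above (inferred from the
    -- call site, not quoted from git-annex):
    --
    --   catObjectStreamLsTree
    --       :: [Git.LsTree.TreeItem]            -- entries from ls-tree
    --       -> (Git.LsTree.TreeItem -> Maybe a) -- which entries to fetch
    --       -> Git.Repo
    --       -> (IO (Maybe (a, Maybe L.ByteString)) -> Annex r) -- consumer
    --       -> Annex r
    --
    -- The consumer is handed an action that yields Just the next result
    -- until the stream is drained. `drain` mirrors that loop.
    drain :: IO (Maybe (FilePath, Bool)) -> IO ()
    drain getnext = getnext >>= \case
        Just (path, isregfile) -> do
            putStrLn (path ++ (if isregfile then " (regular file)" else " (symlink)"))
            drain getnext
        Nothing -> return ()

    -- Simulate such a stream with an IORef holding the remaining items.
    main :: IO ()
    main = do
        r <- newIORef [("foo.txt", True), ("bar", False)]
        let getnext = atomicModifyIORef r $ \case
                [] -> ([], Nothing)
                (x:xs) -> (xs, Just x)
        drain getnext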


@@ -0,0 +1,16 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 12"""
+ date="2021-05-31T16:30:11Z"
+ content="""
+Implemented streaming through git. In a repo with 100000 unlocked files,
+version 8.20210429 took 46 seconds, now reduced to 36 seconds.
+
+When the files are locked, of course the old version was faster
+due to being able to skip all symlinks, 2 seconds. The new version takes
+slightly less time than it does for unlocked files, 35 seconds.
+
+Now the git query and processing is only a few seconds of the total run time,
+writing information about all the files to sqlite is most of the rest,
+and may also be possible to speed up.
+"""]]