avoid double work in git-annex init, second try

reconcileStaged populates the db, so scanAnnexedFiles does not need to
do it again. It still makes a pass over the HEAD tree, but populating
the db was most of the expensive part.

Benchmarking with 100,000 files, git-annex init now takes 40 seconds,
vs 37 seconds with the old, buggy version of this fix. It should be
possible to win those 3 precious seconds per 100k files back, in the
case when annex.thin is not set, with improvements to reconcileStaged
that avoid needing this second pass.

Sponsored-by: Dartmouth College's Datalad project
Joey Hess 2021-06-08 09:27:53 -04:00
parent 22185b4a4e
commit c941ab6f5b
3 changed files with 10 additions and 19 deletions
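
To make the new ordering concrete, here is a minimal, self-contained Haskell sketch of the flow described in the commit message. The names below (withKeysDbWriter, scanHeadForPointerFiles) are illustrative stand-ins, not git-annex's actual API.

-- Illustrative stand-ins only, not git-annex's real functions.
module Main where

-- Stand-in for Database.Keys.runWriter: opening the keys database
-- writer runs reconcileStaged, which records all annexed files in the
-- associated-files table as a side effect.
withKeysDbWriter :: (String -> IO ()) -> IO ()
withKeysDbWriter action = do
    putStrLn "reconcileStaged: diffing the index and recording annexed files"
    action "keys-db-handle"

-- Stand-in for the remaining pass over the HEAD tree, which now only
-- has to populate pointer files whose annex object is already present.
scanHeadForPointerFiles :: IO ()
scanHeadForPointerFiles =
    putStrLn "second pass: populating pointer files from present objects"

main :: IO ()
main = do
    -- Opening the writer, even with a no-op action, is what populates
    -- the database; the scan below no longer needs to do that work.
    withKeysDbWriter (const (pure ()))
    scanHeadForPointerFiles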

View file

@@ -22,7 +22,6 @@ import qualified Git.Ref
 import qualified Git.LsTree
 import qualified Git.Types
 import qualified Database.Keys
-import qualified Database.Keys.SQL
 import Config
 import qualified Utility.RawFilePath as R
@@ -81,9 +80,15 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
 -}
 scanAnnexedFiles :: Annex ()
 scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
+	-- This gets the keys database populated with all annexed files,
+	-- by running Database.Keys.reconcileStaged.
+	Database.Keys.runWriter (const noop)
+	-- The above tries to populate pointer files, but one thing it
+	-- is not able to handle is populating a pointer file when the
+	-- annex object file already exists, but its inode is not yet
+	-- cached. So, the rest of this makes another pass over the
+	-- tree to do that.
 	g <- Annex.gitRepo
-	Database.Keys.runWriter $
-		liftIO . Database.Keys.SQL.dropAllAssociatedFiles
 	(l, cleanup) <- inRepo $ Git.LsTree.lsTree
 		Git.LsTree.LsTreeRecursive
 		(Git.LsTree.LsTreeLong True)
@@ -112,8 +117,6 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
 	add i isregfile k = do
 		let tf = Git.LsTree.file i
-		Database.Keys.runWriter $
-			liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
 		whenM (pure isregfile <&&> inAnnex k) $ do
			f <- fromRepo $ fromTopFilePath tf
			liftIO (isPointerFile f) >>= \case
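
The comments added above describe the one case reconcileStaged cannot handle on its own: a pointer file whose annex object is already present but whose inode has not yet been cached. As a rough illustration of what the second pass does for such a file, here is a self-contained sketch; looksLikePointerFile and populateFromObject are hypothetical simplifications of the real machinery, and the plain copy stands in for the case where annex.thin is not set.

-- Hypothetical simplification of one step of the second pass.
module Main where

import Control.Monad (when)
import System.Directory (doesFileExist, copyFile, getFileSize)
import qualified Data.ByteString.Char8 as B

-- Crude stand-in for a pointer-file check: pointer files are tiny and
-- their content refers to a path under annex/objects.
looksLikePointerFile :: FilePath -> IO Bool
looksLikePointerFile f = do
    sz <- getFileSize f
    if sz > 32768
        then return False
        else B.isInfixOf (B.pack "annex/objects/") <$> B.readFile f

-- If the annex object already exists and the worktree file is still a
-- pointer, fill the worktree file in from the object (a plain copy when
-- annex.thin is not set). The real code also records an inode cache for
-- the result, which is the part reconcileStaged could not yet do itself.
populateFromObject :: FilePath -> FilePath -> IO ()
populateFromObject objectpath worktreefile = do
    haveobject <- doesFileExist objectpath
    stillpointer <- if haveobject
        then looksLikePointerFile worktreefile
        else return False
    when (haveobject && stillpointer) $
        copyFile objectpath worktreefile

main :: IO ()
main = populateFromObject
    ".git/annex/objects/xx/yy/KEY/KEY" -- hypothetical object path
    "bigfile"                          -- worktree pointer file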

View file

@@ -339,7 +339,7 @@ reconcileStaged qh = do
 				(asTopFilePath file)
 				(SQL.WriteHandle qh)
 			when (dstmode /= fmtTreeItemType TreeSymlink) $
-				reconcilerace (asTopFilePath file) key
+				reconcilepointerfile (asTopFilePath file) key
 			return True
 		Nothing -> return False
 	procdiff mdfeeder rest
@@ -367,7 +367,7 @@ reconcileStaged qh = do
 		_ -> return conflicted -- parse failed
 	procmergeconflictdiff _ _ conflicted = return conflicted
 
-	reconcilerace file key = do
+	reconcilepointerfile file key = do
 		caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
 		keyloc <- calcRepo (gitAnnexLocation key)
 		keypopulated <- sameInodeCache keyloc caches
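
The reconcilepointerfile check above hinges on inode caches: the keys database remembers the inode, size, and mtime a file had when its content was last put in place, and comparing the current values against that record is how git-annex decides whether the annex object or the worktree file is populated. Below is a simplified, self-contained version of that comparison (the real InodeCache also tolerates reduced mtime precision and other variations); it uses the POSIX-only unix package.

-- Simplified sketch of an inode-cache comparison, not git-annex's code.
module Main where

import System.Posix.Files (getFileStatus, fileID, fileSize, modificationTimeHiRes)
import System.Posix.Types (FileID, FileOffset)
import Data.Time.Clock.POSIX (POSIXTime)

data InodeCache = InodeCache
    { cacheInode :: FileID
    , cacheSize :: FileOffset
    , cacheMtime :: POSIXTime
    }
    deriving (Eq, Show)

-- Record the current inode cache of a file.
genInodeCache :: FilePath -> IO InodeCache
genInodeCache f = do
    st <- getFileStatus f
    return (InodeCache (fileID st) (fileSize st) (modificationTimeHiRes st))

-- A file counts as "populated" when its current inode cache matches one
-- recorded when the content was last put in place.
sameInodeCache :: FilePath -> [InodeCache] -> IO Bool
sameInodeCache f caches = do
    cur <- genInodeCache f
    return (cur `elem` caches)

main :: IO ()
main = do
    -- Example: a path trivially matches a cache just taken of it.
    c <- genInodeCache "."
    print =<< sameInodeCache "." [c]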

View file

@@ -88,18 +88,6 @@ addAssociatedFile k f = queueDb $
   where
 	af = SFilePath (getTopFilePath f)
 
--- Does not remove any old association for a file, but less expensive
--- than addAssociatedFile. Calling dropAllAssociatedFiles first and then
--- this is an efficient way to update all associated files.
-addAssociatedFileFast :: Key -> TopFilePath -> WriteHandle -> IO ()
-addAssociatedFileFast k f = queueDb $ void $ insertUnique $ Associated k af
-  where
-	af = SFilePath (getTopFilePath f)
-
-dropAllAssociatedFiles :: WriteHandle -> IO ()
-dropAllAssociatedFiles = queueDb $
-	deleteWhere ([] :: [Filter Associated])
-
 {- Note that the files returned were once associated with the key, but
  - some of them may not be any longer. -}
 getAssociatedFiles :: Key -> ReadHandle -> IO [TopFilePath]
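
For context on what was removed: addAssociatedFileFast and dropAllAssociatedFiles implemented a bulk-refresh pattern for the associated-files table, clear it entirely, then re-insert every row with an insert that merely skips duplicates rather than looking up and replacing an old association. reconcileStaged now keeps the table up to date incrementally, so the pattern is no longer needed. Here is a sketch of the pattern itself, written against sqlite-simple rather than the persistent layer the real database code uses, with made-up example rows:

{-# LANGUAGE OverloadedStrings #-}
module Main where

import Database.SQLite.Simple

main :: IO ()
main = do
    conn <- open ":memory:"
    execute_ conn
        "CREATE TABLE associated (key TEXT, file TEXT, UNIQUE (key, file))"
    -- dropAllAssociatedFiles: cheaper than deleting stale rows one by one.
    execute_ conn "DELETE FROM associated"
    -- addAssociatedFileFast: INSERT OR IGNORE mirrors insertUnique; it does
    -- not look for and remove any old association for the file first.
    executeMany conn
        "INSERT OR IGNORE INTO associated (key, file) VALUES (?, ?)"
        ([("KEY1", "foo.dat"), ("KEY2", "bar.dat")] :: [(String, String)])
    rows <- query_ conn "SELECT key, file FROM associated" :: IO [(String, String)]
    mapM_ print rows
    close conn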