avoid double work in git-annex init

reconcileStaged was doing a redundant scan to scannAnnexedFiles. It would probably make sense to move the body of scannAnnexedFiles into reconcileStaged, the separation does not really serve any purpose. Sponsored-by: Dartmouth College's Datalad project
2021-06-07 16:50:14 -04:00 · 2021-06-07 16:50:14 -04:00 · 0f10f208a7
commit 0f10f208a7
parent 6ceb31a30a
5 changed files with 93 additions and 38 deletions
--- a/Annex/WorkTree.hs
+++ b/Annex/WorkTree.hs
@ -69,28 +69,30 @@ ifAnnexed :: RawFilePath -> (Key -> Annex a) -> Annex a -> Annex a
 ifAnnexed file yes no = maybe no yes =<< lookupKey file
 {- Find all annexed files and update the keys database for them.
 - 
 - This is expensive, and so normally the associated files are updated
 - incrementally when changes are noticed. So, this only needs to be done
 - when initializing/upgrading a repository.
 - 
 - Also, the content for an unlocked file may already be present as
 - an annex object. If so, populate the pointer file with it.
 - But if worktree file does not have a pointer file's content, it is left
 - as-is.
 -
 - Normally the keys database is updated incrementally when changes are
 - noticed. For an initial scan, this is faster than that incremental
 - update.
 -}
 scanAnnexedFiles :: Annex ()
-scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
+scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $
-	g <- Annex.gitRepo
+	Database.Keys.runWriter' (Just reconciler) (const noop)
 	Database.Keys.runWriter $
 		liftIO . Database.Keys.SQL.dropAllAssociatedFiles
 	(l, cleanup) <- inRepo $ Git.LsTree.lsTree
 		Git.LsTree.LsTreeRecursive
 		(Git.LsTree.LsTreeLong True)
 		Git.Ref.headRef
 	catObjectStreamLsTree l want g go
 	liftIO $ void cleanup
  where
 	reconciler dbh = do
 		g <- Annex.gitRepo
 		liftIO $ Database.Keys.SQL.dropAllAssociatedFiles dbh
 		(l, cleanup) <- inRepo $ Git.LsTree.lsTree
 			Git.LsTree.LsTreeRecursive
 			(Git.LsTree.LsTreeLong True)
 			Git.Ref.headRef
 		catObjectStreamLsTree l want g (go dbh)
 		liftIO $ void cleanup
 	-- Want to process symlinks, and regular files.
 	want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
 		Just Git.Types.TreeSymlink -> Just (i, False)
@ -103,17 +105,16 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ d
 		Just n | n < maxPointerSz -> Just (i, True)
 		_ -> Nothing
-	go getnext = liftIO getnext >>= \case
+	go dbh getnext = liftIO getnext >>= \case
 		Just ((i, isregfile), Just c) -> do
-			maybe noop (add i isregfile)
+			maybe noop (add i isregfile dbh)
 				(parseLinkTargetOrPointer (L.toStrict c))
-			go getnext
+			go dbh getnext
 		_ -> return ()
-	add i isregfile k = do
+	add i isregfile dbh k = do
 		let tf = Git.LsTree.file i
-		Database.Keys.runWriter $
+		liftIO $ Database.Keys.SQL.addAssociatedFileFast k tf dbh
 			liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
 		whenM (pure isregfile <&&> inAnnex k) $ do
 			f <- fromRepo $ fromTopFilePath tf
 			liftIO (isPointerFile f) >>= \case
--- a/Database/Keys.hs
+++ b/Database/Keys.hs
@ -23,6 +23,7 @@ module Database.Keys (
 	removeInodeCache,
 	isInodeKnown,
 	runWriter,
 	runWriter',
 ) where
 import qualified Database.Keys.SQL as SQL
@ -73,7 +74,7 @@ runReader a = do
 		v <- a (SQL.ReadHandle qh)
 		return (v, st)
 	go DbClosed = do
-		st' <- openDb True DbClosed
+		st' <- openDb True Nothing DbClosed
 		v <- case st' of
 			(DbOpen qh) -> a (SQL.ReadHandle qh)
 			_ -> return mempty
@ -87,7 +88,15 @@ runReaderIO a = runReader (liftIO . a)
 -
 - The database is created if it doesn't exist yet. -}
 runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex ()
-runWriter a = do
+runWriter = runWriter' Nothing
 runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
 runWriterIO a = runWriter (liftIO . a)
 {- When a reconcile action is passed, it is run by reconcileStaged instead
 - of its usual scan, and must update the database in the same way. -}
 runWriter' :: Maybe (SQL.WriteHandle -> Annex ()) -> (SQL.WriteHandle -> Annex ()) -> Annex ()
 runWriter' reconciler a = do
 	h <- Annex.getRead Annex.keysdbhandle
 	withDbState h go
  where
@ -95,15 +104,12 @@ runWriter a = do
 		v <- a (SQL.WriteHandle qh)
 		return (v, st)
 	go st = do
-		st' <- openDb False st
+		st' <- openDb False reconciler st
 		v <- case st' of
 			DbOpen qh -> a (SQL.WriteHandle qh)
 			_ -> error "internal"
 		return (v, st')
 runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
 runWriterIO a = runWriter (liftIO . a)
 {- Opens the database, creating it if it doesn't exist yet.
 -
 - Multiple readers and writers can have the database open at the same
@ -112,10 +118,10 @@ runWriterIO a = runWriter (liftIO . a)
 - the database doesn't exist yet, one caller wins the lock and
 - can create it undisturbed.
 -}
-openDb :: Bool -> DbState -> Annex DbState
+openDb :: Bool -> (Maybe (SQL.WriteHandle -> Annex ())) -> DbState -> Annex DbState
-openDb _ st@(DbOpen _) = return st
+openDb _ _ st@(DbOpen _) = return st
-openDb False DbUnavailable = return DbUnavailable
+openDb False _ DbUnavailable = return DbUnavailable
-openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do
+openDb forwrite reconciler _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do
 	dbdir <- fromRepo gitAnnexKeysDb
 	let db = dbdir P.</> "db"
 	dbexists <- liftIO $ R.doesPathExist db
@ -133,7 +139,7 @@ openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKe
 	open db = do
 		qh <- liftIO $ H.openDbQueue H.MultiWriter db SQL.containedTable
-		reconcileStaged qh
+		reconcileStaged qh reconciler
 		return $ DbOpen qh
 {- Closes the database if it was open. Any writes will be flushed to it.
@ -223,8 +229,8 @@ isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s)
 - So when using getAssociatedFiles, have to make sure the file still
 - is an associated file.
 -}
-reconcileStaged :: H.DbQueue -> Annex ()
+reconcileStaged :: H.DbQueue -> Maybe (SQL.WriteHandle -> Annex ()) -> Annex ()
-reconcileStaged qh = do
+reconcileStaged qh mreconciler = do
 	gitindex <- inRepo currentIndexFile
 	indexcache <- fromRawFilePath <$> fromRepo gitAnnexKeysDbIndexCache
 	withTSDelta (liftIO . genInodeCache gitindex) >>= \case
@ -246,12 +252,21 @@ reconcileStaged qh = do
 	go cur indexcache (Just newtree) = do
 		oldtree <- getoldtree
 		when (oldtree /= newtree) $ do
 			case mreconciler of
 				Just reconciler ->
 					reconciler (SQL.WriteHandle qh)
 				Nothing -> noop
 			g <- Annex.gitRepo
-			void $ catstream $ \mdfeeder -> 
+			void $ catstream $ \mdfeeder -> do
-				void $ updatetodiff g
+				case mreconciler of
-					(Just (fromRef oldtree)) 
+					Nothing -> void $ updatetodiff g
-					(fromRef newtree)
+						(Just (fromRef oldtree)) 
-					(procdiff mdfeeder)
+						(fromRef newtree)
 						(procdiff mdfeeder)
 					Just _ -> void $ updatetodiff g
 						Nothing "--staged"
 						(procdiff mdfeeder)
 			liftIO $ writeFile indexcache $ showInodeCache cur
 			-- Storing the tree in a ref makes sure it does not
 			-- get garbage collected, and is available to diff
--- a/doc/bugs/significant_performance_regression_impacting_datal/comment_4_85d1031d2b51c0fc1271c283d8ee7888._comment
+++ b/doc/bugs/significant_performance_regression_impacting_datal/comment_4_85d1031d2b51c0fc1271c283d8ee7888._comment
@ -0,0 +1,14 @@
 [[!comment format=mdwn
 username="joey"
 subject="""comment 4"""
 date="2021-06-07T20:22:24Z"
 content="""
 Turns out `git-annex init` got a lot slower than it had to, it was doing
 the same kind of scan twice. I think that probably explains much of the
 slowdown you saw. (Although it still needs to do somewhat more work than
 before, but also does it more efficiently than before.)
 Also I've optimised the update scan's use of git, around a 20% speedup,
 although I don't know if your test case speed was impacted much by that
 scan anyway.
 """]]
--- a/doc/bugs/significant_performance_regression_impacting_datal/comment_5_b0f1e94e96cf8fe992425cf81f8d1105._comment
+++ b/doc/bugs/significant_performance_regression_impacting_datal/comment_5_b0f1e94e96cf8fe992425cf81f8d1105._comment
@ -0,0 +1,9 @@
 [[!comment format=mdwn
 username="joey"
 subject="""comment 5"""
 date="2021-06-07T20:42:34Z"
 content="""
 Looking at the CI log, I don't see any past runs that took 1h 46min.
 A month ago they were taking 2h 6min. Let's see if the changes I'm pushing
 now drop it back to that.
 """]]
--- a/doc/todo/Avoid_lengthy_34Scanning_for_unlocked_files_...34/comment_20_e9a36e9600561201969c4d21499833af._comment
+++ b/doc/todo/Avoid_lengthy_34Scanning_for_unlocked_files_...34/comment_20_e9a36e9600561201969c4d21499833af._comment
@ -0,0 +1,16 @@
 [[!comment format=mdwn
 username="joey"
 subject="""comment 20"""
 date="2021-06-07T19:22:03Z"
 content="""
 Turns out git-annex init was running both scanAnnexedFiles and
 reconcileStaged, which after recent changes to the latter, both do
 close to the same scan when run in a fresh clone. So double work!
 Benchmarking with 100,000 files, git-annex init took 88 seconds.  
 Fixed not to use reconcileStaged it took 37 seconds.
 (Keeping reconcileStaged and removing scanAnnexedFiles it took 47 seconds.
 That makes sense; reconcileStaged is an incremental updater and is not
 able to use SQL as efficiently as scanAnnexedFiles.)
 """]]