optimise reconcileStaged with git cat-file streaming

Commit 428c91606b made reconcileStaged do more work in situations
such as switching between very different branches.

Compare with seekFilteredKeys, which has a similar optimisation; it might be
possible to factor out the part the two have in common.

Sponsored-by: Dartmouth College's Datalad project
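
For context, the batching technique the subject line refers to can be sketched outside git-annex: rather than paying one request/response round trip per object (as a catKey call per file does), many object names are written to a single long-lived `git cat-file --batch-check` process while a second thread drains the replies. The snippet below is a minimal, hypothetical illustration using only the process and async packages, not git-annex's Git.CatFile interface.

    -- Illustration only: a self-contained sketch of the batching idea, using a
    -- plain `git cat-file --batch-check` pipe rather than git-annex's internal
    -- Git.CatFile interface. One long-lived git process answers many queries,
    -- and the feeder runs concurrently with the reader so a full pipe buffer
    -- cannot deadlock either side. Assumes the process and async packages.
    import Control.Concurrent.Async (concurrently)
    import Control.Monad (forM_, replicateM)
    import System.IO
    import System.Process

    batchCheck :: [String] -> IO [String]
    batchCheck objs = do
        (Just hin, Just hout, _, p) <- createProcess
            (proc "git" ["cat-file", "--batch-check"])
                { std_in = CreatePipe, std_out = CreatePipe }
        hSetBuffering hin LineBuffering
        (_, replies) <- concurrently
            (forM_ objs (hPutStrLn hin) >> hClose hin)
            (replicateM (length objs) (hGetLine hout))
        _ <- waitForProcess p
        return replies

    main :: IO ()
    main = batchCheck ["HEAD", "HEAD^{tree}"] >>= mapM_ putStrLn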
Joey Hess 2021-06-07 14:51:38 -04:00
parent 70dbe61fc2
commit 6ceb31a30a
2 changed files with 84 additions and 38 deletions


@@ -34,7 +34,6 @@ import Annex.Locations
 import Annex.Common hiding (delete)
 import qualified Annex
 import Annex.LockFile
-import Annex.CatFile
 import Annex.Content.PointerFile
 import Annex.Link
 import Utility.InodeCache
@@ -45,6 +44,7 @@ import Git.Command
 import Git.Types
 import Git.Index
 import Git.Sha
+import Git.CatFile
 import Git.Branch (writeTreeQuiet, update')
 import qualified Git.Ref
 import Config.Smudge
@@ -53,6 +53,7 @@ import qualified Utility.RawFilePath as R
 import qualified Data.ByteString as S
 import qualified Data.ByteString.Char8 as S8
 import qualified System.FilePath.ByteString as P
+import Control.Concurrent.Async
 
 {- Runs an action that reads from the database.
  -
@@ -241,12 +242,16 @@ reconcileStaged qh = do
 		<$> catchMaybeIO (readFile indexcache)
 
 	getoldtree = fromMaybe emptyTree <$> inRepo (Git.Ref.sha lastindexref)
 
 	go cur indexcache (Just newtree) = do
 		oldtree <- getoldtree
 		when (oldtree /= newtree) $ do
-			updatetodiff (Just (fromRef oldtree)) (fromRef newtree) procdiff
-				>>= flushdb . fst
+			g <- Annex.gitRepo
+			void $ catstream $ \mdfeeder ->
+				void $ updatetodiff g
+					(Just (fromRef oldtree))
+					(fromRef newtree)
+					(procdiff mdfeeder)
 			liftIO $ writeFile indexcache $ showInodeCache cur
 			-- Storing the tree in a ref makes sure it does not
 			-- get garbage collected, and is available to diff
@@ -264,24 +269,18 @@ reconcileStaged qh = do
 	-- is done, with --staged but no old tree.
 	go _ _ Nothing = do
 		oldtree <- getoldtree
-		(changed, conflicted) <- updatetodiff
-			(Just (fromRef oldtree)) "--staged" procdiff
-		changed' <- if conflicted
-			then fst <$> updatetodiff Nothing "--staged"
-				procmergeconflictdiff
-			else pure False
-		flushdb (changed || changed')
+		g <- Annex.gitRepo
+		catstream $ \mdfeeder -> do
+			conflicted <- updatetodiff g
+				(Just (fromRef oldtree)) "--staged" (procdiff mdfeeder)
+			when conflicted $
+				void $ updatetodiff g Nothing "--staged"
+					(procmergeconflictdiff mdfeeder)
 
-	updatetodiff old new processor = do
-		(l, cleanup) <- inRepo $ pipeNullSplit' $ diff old new
-		processor l False False
-			`finally` void (liftIO cleanup)
-
-	-- Flush database changes immediately
-	-- so other processes can see them.
-	flushdb changed
-		| changed = liftIO $ H.flushDbQueue qh
-		| otherwise = noop
+	updatetodiff g old new processor = do
+		(l, cleanup) <- pipeNullSplit' (diff old new) g
+		processor l False
+			`finally` void cleanup
 
 	-- Avoid running smudge clean filter, which would block trying to
 	-- access the locked database. git write-tree sometimes calls it,
@@ -320,22 +319,21 @@ reconcileStaged qh = do
 		, Param "--no-ext-diff"
 		]
 
-	procdiff (info:file:rest) changed conflicted
+	procdiff mdfeeder (info:file:rest) conflicted
 		| ":" `S.isPrefixOf` info = case S8.words info of
 			(_colonsrcmode:dstmode:srcsha:dstsha:status:[]) -> do
 				let conflicted' = status == "U"
 				-- avoid removing associated file when
 				-- there is a merge conflict
-				removed <- if not conflicted'
-					then catKey (Ref srcsha) >>= \case
+				unless conflicted' $
+					send mdfeeder (Ref srcsha) $ \case
 						Just oldkey -> do
 							liftIO $ SQL.removeAssociatedFile oldkey
 								(asTopFilePath file)
 								(SQL.WriteHandle qh)
 							return True
 						Nothing -> return False
-					else return False
-				added <- catKey (Ref dstsha) >>= \case
+				send mdfeeder (Ref dstsha) $ \case
 					Just key -> do
 						liftIO $ SQL.addAssociatedFile key
 							(asTopFilePath file)
@@ -344,32 +342,30 @@ reconcileStaged qh = do
 						reconcilerace (asTopFilePath file) key
 						return True
 					Nothing -> return False
-				procdiff rest
-					(changed || removed || added)
+				procdiff mdfeeder rest
 					(conflicted || conflicted')
-			_ -> return (changed, conflicted) -- parse failed
-	procdiff _ changed conflicted = return (changed, conflicted)
+			_ -> return conflicted -- parse failed
+	procdiff _ _ conflicted = return conflicted
 
 	-- Processing a diff --index when there is a merge conflict.
 	-- This diff will have the new local version of a file as the
 	-- first sha, and a null sha as the second sha, and we only
 	-- care about files that are in conflict.
-	procmergeconflictdiff (info:file:rest) changed conflicted
+	procmergeconflictdiff mdfeeder (info:file:rest) conflicted
 		| ":" `S.isPrefixOf` info = case S8.words info of
 			(_colonmode:_mode:sha:_sha:status:[]) -> do
-				let conflicted' = status == "U"
-				added <- catKey (Ref sha) >>= \case
+				send mdfeeder (Ref sha) $ \case
 					Just key -> do
 						liftIO $ SQL.addAssociatedFile key
 							(asTopFilePath file)
 							(SQL.WriteHandle qh)
 						return True
 					Nothing -> return False
-				procmergeconflictdiff rest
-					(changed || added)
+				let conflicted' = status == "U"
+				procmergeconflictdiff mdfeeder rest
 					(conflicted || conflicted')
-			_ -> return (changed, conflicted) -- parse failed
-	procmergeconflictdiff _ changed conflicted = return (changed, conflicted)
+			_ -> return conflicted -- parse failed
+	procmergeconflictdiff _ _ conflicted = return conflicted
 
 	reconcilerace file key = do
 		caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
@@ -385,3 +381,41 @@ reconcileStaged qh = do
 				SQL.addInodeCaches key [ic] (SQL.WriteHandle qh)
 			(False, True) -> depopulatePointerFile key p
 			_ -> return ()
+
+	send :: ((Maybe Key -> Annex a, Ref) -> IO ()) -> Ref -> (Maybe Key -> Annex a) -> IO ()
+	send feeder r withk = feeder (withk, r)
+
+	-- Streaming through git cat-file like this is significantly
+	-- faster than using catKey.
+	catstream a = do
+		g <- Annex.gitRepo
+		catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
+			catObjectStream g $ \catfeeder catcloser catreader -> do
+				feedt <- liftIO $ async $
+					a mdfeeder
+						`finally` void mdcloser
+				proct <- liftIO $ async $
+					procthread mdreader catfeeder
+						`finally` void catcloser
+				dbchanged <- dbwriter False catreader
+				-- Flush database changes now
+				-- so other processes can see them.
+				when dbchanged $
+					liftIO $ H.flushDbQueue qh
+				() <- liftIO $ wait feedt
+				liftIO $ wait proct
+				return ()
+	  where
+		procthread mdreader catfeeder = mdreader >>= \case
+			Just (ka, Just (sha, size, _type))
+				| size < maxPointerSz -> do
+					() <- catfeeder (ka, sha)
+					procthread mdreader catfeeder
+			Just _ -> procthread mdreader catfeeder
+			Nothing -> return ()
+
+		dbwriter dbchanged catreader = liftIO catreader >>= \case
+			Just (ka, content) -> do
+				changed <- ka (parseLinkTargetOrPointerLazy =<< content)
+				dbwriter (dbchanged || changed) catreader
+			Nothing -> return dbchanged
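
The heart of the change is catstream above: every object requested from git cat-file is paired with the database update to run on its reply, a feeder thread keeps the pipe full, and a single writer drains the replies in order, flushing the queue only if something changed. A stripped-down, hypothetical version of that pairing pattern, using a plain channel instead of git-annex's catObjectStream machinery, might look like this:

    {-# LANGUAGE LambdaCase #-}
    -- Hypothetical sketch (not git-annex's API) of pairing each request with
    -- the action to run on its reply. The feeder enqueues (callback, request)
    -- pairs; a single consumer applies each callback in order and reports
    -- whether any of them changed anything, so the caller knows to flush.
    import Control.Concurrent.Async (concurrently)
    import Control.Concurrent.Chan

    type Request = String
    type Reply = String

    runPipeline :: (Request -> IO Reply) -> [(Reply -> IO Bool, Request)] -> IO Bool
    runPipeline query items = do
        chan <- newChan
        snd <$> concurrently (feeder chan) (consumer chan False)
      where
        feeder chan = mapM_ (writeChan chan . Just) items >> writeChan chan Nothing
        consumer chan changed = readChan chan >>= \case
            Nothing -> return changed
            Just (onreply, req) -> do
                r <- query req     -- stand-in for reading the cat-file reply
                c <- onreply r     -- e.g. add or remove an associated file
                consumer chan (changed || c)

    main :: IO ()
    main = do
        changed <- runPipeline (\req -> return ("reply for " ++ req))
            [ (\r -> putStrLn r >> return True, "HEAD")
            , (\r -> putStrLn r >> return False, "HEAD^{tree}")
            ]
        print changed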


@@ -5,3 +5,15 @@ Normally it's plenty fast enough, but users who often switch between
 branches that have tens to hundreds of thousands of diverged files will
 find it slow, and this should speed it up by somewhere around 3x (excluding
 sqlite writes). --[[Joey]]
+
+> Implemented this. Benchmarked it in a situation where 100,000 annexed
+> files were added to the index (by checking out a branch with more annexed
+> files). old: 50 seconds; new: 41 seconds
+>
+> Also benchmarked when 100,000 annexed files were removed from the index.
+> old: 26 seconds; new: 17 seconds.
+>
+> Adding associated files to the sqlite db is clearly more expensive than
+> removing from it.
+>
+> [[done]] --[[Joey]]