optimise reconcileStaged with git cat-file streaming

Commit 428c91606b made reconcileStaged do more work in situations
such as switching between very different branches.

Compare with seekFilteredKeys, which has a similar optimisation; it might be
possible to factor out the part the two have in common.

Sponsored-by: Dartmouth College's Datalad project
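
For context, the batching technique the subject line refers to can be sketched outside git-annex: rather than paying one request/response round trip per object (as a catKey call per file does), many object names are written to a single long-lived `git cat-file --batch-check` process while a second thread drains the replies. The snippet below is a minimal, hypothetical illustration using only the process and async packages, not git-annex's Git.CatFile interface.

    -- Illustration only: a self-contained sketch of the batching idea, using a
    -- plain `git cat-file --batch-check` pipe rather than git-annex's internal
    -- Git.CatFile interface. One long-lived git process answers many queries,
    -- and the feeder runs concurrently with the reader so a full pipe buffer
    -- cannot deadlock either side. Assumes the process and async packages.
    import Control.Concurrent.Async (concurrently)
    import Control.Monad (forM_, replicateM)
    import System.IO
    import System.Process

    batchCheck :: [String] -> IO [String]
    batchCheck objs = do
        (Just hin, Just hout, _, p) <- createProcess
            (proc "git" ["cat-file", "--batch-check"])
                { std_in = CreatePipe, std_out = CreatePipe }
        hSetBuffering hin LineBuffering
        (_, replies) <- concurrently
            (forM_ objs (hPutStrLn hin) >> hClose hin)
            (replicateM (length objs) (hGetLine hout))
        _ <- waitForProcess p
        return replies

    main :: IO ()
    main = batchCheck ["HEAD", "HEAD^{tree}"] >>= mapM_ putStrLn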
Joey Hess 2021-06-07 14:51:38 -04:00
parent 70dbe61fc2
commit 6ceb31a30a
2 changed files with 84 additions and 38 deletions


@@ -34,7 +34,6 @@ import Annex.Locations
 import Annex.Common hiding (delete)
 import qualified Annex
 import Annex.LockFile
-import Annex.CatFile
 import Annex.Content.PointerFile
 import Annex.Link
 import Utility.InodeCache
@@ -45,6 +44,7 @@ import Git.Command
 import Git.Types
 import Git.Index
 import Git.Sha
+import Git.CatFile
 import Git.Branch (writeTreeQuiet, update')
 import qualified Git.Ref
 import Config.Smudge
@@ -53,6 +53,7 @@ import qualified Utility.RawFilePath as R
 import qualified Data.ByteString as S
 import qualified Data.ByteString.Char8 as S8
 import qualified System.FilePath.ByteString as P
+import Control.Concurrent.Async
 
 {- Runs an action that reads from the database.
  -
@@ -241,12 +242,16 @@ reconcileStaged qh = do
 		<$> catchMaybeIO (readFile indexcache)
 
 	getoldtree = fromMaybe emptyTree <$> inRepo (Git.Ref.sha lastindexref)
 
 	go cur indexcache (Just newtree) = do
 		oldtree <- getoldtree
 		when (oldtree /= newtree) $ do
-			updatetodiff (Just (fromRef oldtree)) (fromRef newtree) procdiff
-				>>= flushdb . fst
+			g <- Annex.gitRepo
+			void $ catstream $ \mdfeeder ->
+				void $ updatetodiff g
+					(Just (fromRef oldtree))
+					(fromRef newtree)
+					(procdiff mdfeeder)
 			liftIO $ writeFile indexcache $ showInodeCache cur
 			-- Storing the tree in a ref makes sure it does not
 			-- get garbage collected, and is available to diff
@@ -264,24 +269,18 @@ reconcileStaged qh = do
 	-- is done, with --staged but no old tree.
 	go _ _ Nothing = do
 		oldtree <- getoldtree
-		(changed, conflicted) <- updatetodiff
-			(Just (fromRef oldtree)) "--staged" procdiff
-		changed' <- if conflicted
-			then fst <$> updatetodiff Nothing "--staged"
-				procmergeconflictdiff
-			else pure False
-		flushdb (changed || changed')
+		g <- Annex.gitRepo
+		catstream $ \mdfeeder -> do
+			conflicted <- updatetodiff g
+				(Just (fromRef oldtree)) "--staged" (procdiff mdfeeder)
+			when conflicted $
+				void $ updatetodiff g Nothing "--staged"
+					(procmergeconflictdiff mdfeeder)
 
-	updatetodiff old new processor = do
-		(l, cleanup) <- inRepo $ pipeNullSplit' $ diff old new
-		processor l False False
-			`finally` void (liftIO cleanup)
-
-	-- Flush database changes immediately
-	-- so other processes can see them.
-	flushdb changed
-		| changed = liftIO $ H.flushDbQueue qh
-		| otherwise = noop
+	updatetodiff g old new processor = do
+		(l, cleanup) <- pipeNullSplit' (diff old new) g
+		processor l False
+			`finally` void cleanup
 
 	-- Avoid running smudge clean filter, which would block trying to
 	-- access the locked database. git write-tree sometimes calls it,
@@ -320,22 +319,21 @@ reconcileStaged qh = do
 		, Param "--no-ext-diff"
 		]
 
-	procdiff (info:file:rest) changed conflicted
+	procdiff mdfeeder (info:file:rest) conflicted
 		| ":" `S.isPrefixOf` info = case S8.words info of
 			(_colonsrcmode:dstmode:srcsha:dstsha:status:[]) -> do
 				let conflicted' = status == "U"
 				-- avoid removing associated file when
 				-- there is a merge conflict
-				removed <- if not conflicted'
-					then catKey (Ref srcsha) >>= \case
+				unless conflicted' $
+					send mdfeeder (Ref srcsha) $ \case
 						Just oldkey -> do
 							liftIO $ SQL.removeAssociatedFile oldkey
 								(asTopFilePath file)
 								(SQL.WriteHandle qh)
 							return True
 						Nothing -> return False
-					else return False
-				added <- catKey (Ref dstsha) >>= \case
+				send mdfeeder (Ref dstsha) $ \case
 					Just key -> do
 						liftIO $ SQL.addAssociatedFile key
 							(asTopFilePath file)
@@ -344,32 +342,30 @@ reconcileStaged qh = do
 						reconcilerace (asTopFilePath file) key
 						return True
 					Nothing -> return False
-				procdiff rest
-					(changed || removed || added)
+				procdiff mdfeeder rest
 					(conflicted || conflicted')
-			_ -> return (changed, conflicted) -- parse failed
-	procdiff _ changed conflicted = return (changed, conflicted)
+			_ -> return conflicted -- parse failed
+	procdiff _ _ conflicted = return conflicted
 
 	-- Processing a diff --index when there is a merge conflict.
 	-- This diff will have the new local version of a file as the
 	-- first sha, and a null sha as the second sha, and we only
 	-- care about files that are in conflict.
-	procmergeconflictdiff (info:file:rest) changed conflicted
+	procmergeconflictdiff mdfeeder (info:file:rest) conflicted
 		| ":" `S.isPrefixOf` info = case S8.words info of
 			(_colonmode:_mode:sha:_sha:status:[]) -> do
-				let conflicted' = status == "U"
-				added <- catKey (Ref sha) >>= \case
+				send mdfeeder (Ref sha) $ \case
 					Just key -> do
 						liftIO $ SQL.addAssociatedFile key
 							(asTopFilePath file)
 							(SQL.WriteHandle qh)
 						return True
 					Nothing -> return False
-				procmergeconflictdiff rest
-					(changed || added)
+				let conflicted' = status == "U"
+				procmergeconflictdiff mdfeeder rest
 					(conflicted || conflicted')
-			_ -> return (changed, conflicted) -- parse failed
-	procmergeconflictdiff _ changed conflicted = return (changed, conflicted)
+			_ -> return conflicted -- parse failed
+	procmergeconflictdiff _ _ conflicted = return conflicted
 
 	reconcilerace file key = do
 		caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
@@ -385,3 +381,41 @@ reconcileStaged qh = do
 				SQL.addInodeCaches key [ic] (SQL.WriteHandle qh)
 			(False, True) -> depopulatePointerFile key p
 			_ -> return ()
+
+	send :: ((Maybe Key -> Annex a, Ref) -> IO ()) -> Ref -> (Maybe Key -> Annex a) -> IO ()
+	send feeder r withk = feeder (withk, r)
+
+	-- Streaming through git cat-file like this is significantly
+	-- faster than using catKey.
+	catstream a = do
+		g <- Annex.gitRepo
+		catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
+			catObjectStream g $ \catfeeder catcloser catreader -> do
+				feedt <- liftIO $ async $
+					a mdfeeder
+						`finally` void mdcloser
+				proct <- liftIO $ async $
+					procthread mdreader catfeeder
+						`finally` void catcloser
+				dbchanged <- dbwriter False catreader
+				-- Flush database changes now
+				-- so other processes can see them.
+				when dbchanged $
+					liftIO $ H.flushDbQueue qh
+				() <- liftIO $ wait feedt
+				liftIO $ wait proct
+				return ()
+	  where
+		procthread mdreader catfeeder = mdreader >>= \case
+			Just (ka, Just (sha, size, _type))
+				| size < maxPointerSz -> do
+					() <- catfeeder (ka, sha)
+					procthread mdreader catfeeder
+			Just _ -> procthread mdreader catfeeder
+			Nothing -> return ()
+
+		dbwriter dbchanged catreader = liftIO catreader >>= \case
+			Just (ka, content) -> do
+				changed <- ka (parseLinkTargetOrPointerLazy =<< content)
+				dbwriter (dbchanged || changed) catreader
+			Nothing -> return dbchanged
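
The heart of the change is catstream above: every object requested from git cat-file is paired with the database update to run on its reply, a feeder thread keeps the pipe full, and a single writer drains the replies in order, flushing the queue only if something changed. A stripped-down, hypothetical version of that pairing pattern, using a plain channel instead of git-annex's catObjectStream machinery, might look like this:

    {-# LANGUAGE LambdaCase #-}
    -- Hypothetical sketch (not git-annex's API) of pairing each request with
    -- the action to run on its reply. The feeder enqueues (callback, request)
    -- pairs; a single consumer applies each callback in order and reports
    -- whether any of them changed anything, so the caller knows to flush.
    import Control.Concurrent.Async (concurrently)
    import Control.Concurrent.Chan

    type Request = String
    type Reply = String

    runPipeline :: (Request -> IO Reply) -> [(Reply -> IO Bool, Request)] -> IO Bool
    runPipeline query items = do
        chan <- newChan
        snd <$> concurrently (feeder chan) (consumer chan False)
      where
        feeder chan = mapM_ (writeChan chan . Just) items >> writeChan chan Nothing
        consumer chan changed = readChan chan >>= \case
            Nothing -> return changed
            Just (onreply, req) -> do
                r <- query req     -- stand-in for reading the cat-file reply
                c <- onreply r     -- e.g. add or remove an associated file
                consumer chan (changed || c)

    main :: IO ()
    main = do
        changed <- runPipeline (\req -> return ("reply for " ++ req))
            [ (\r -> putStrLn r >> return True, "HEAD")
            , (\r -> putStrLn r >> return False, "HEAD^{tree}")
            ]
        print changed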


@@ -5,3 +5,15 @@ Normally it's plenty fast enough, but users who often switch between
 branches that have tens to hundreds of thousands of diverged files will
 find it slow, and this should speed it up by somewhere around 3x (excluding
 sqlite writes). --[[Joey]]
+
+> Implemented this. Benchmarked it in a situation where 100,000 annexed
+> files were added to the index (by checking out a branch with more annexed
+> files). old: 50 seconds; new: 41 seconds
+>
+> Also benchmarked when 100,000 annexed files were removed from the index.
+> old: 26 seconds; new: 17 seconds.
+>
+> Adding associated files to the sqlite db is clearly more expensive than
+> removing from it.
+>
+> [[done]] --[[Joey]]