optimise reconcileStaged with git cat-file streaming

Commit 428c91606b made it need to do more
work in situations like switching between very different branches.

Compare with seekFilteredKeys which has a similar optimisation. Might be
possible to factor out the common part from these?

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2021-06-07 14:51:38 -04:00
parent 70dbe61fc2
commit 6ceb31a30a
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
2 changed files with 84 additions and 38 deletions

View file

@ -34,7 +34,6 @@ import Annex.Locations
import Annex.Common hiding (delete)
import qualified Annex
import Annex.LockFile
import Annex.CatFile
import Annex.Content.PointerFile
import Annex.Link
import Utility.InodeCache
@ -45,6 +44,7 @@ import Git.Command
import Git.Types
import Git.Index
import Git.Sha
import Git.CatFile
import Git.Branch (writeTreeQuiet, update')
import qualified Git.Ref
import Config.Smudge
@ -53,6 +53,7 @@ import qualified Utility.RawFilePath as R
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified System.FilePath.ByteString as P
import Control.Concurrent.Async
{- Runs an action that reads from the database.
-
@ -241,12 +242,16 @@ reconcileStaged qh = do
<$> catchMaybeIO (readFile indexcache)
getoldtree = fromMaybe emptyTree <$> inRepo (Git.Ref.sha lastindexref)
go cur indexcache (Just newtree) = do
oldtree <- getoldtree
when (oldtree /= newtree) $ do
updatetodiff (Just (fromRef oldtree)) (fromRef newtree) procdiff
>>= flushdb . fst
g <- Annex.gitRepo
void $ catstream $ \mdfeeder ->
void $ updatetodiff g
(Just (fromRef oldtree))
(fromRef newtree)
(procdiff mdfeeder)
liftIO $ writeFile indexcache $ showInodeCache cur
-- Storing the tree in a ref makes sure it does not
-- get garbage collected, and is available to diff
@ -264,24 +269,18 @@ reconcileStaged qh = do
-- is done, with --staged but no old tree.
go _ _ Nothing = do
oldtree <- getoldtree
(changed, conflicted) <- updatetodiff
(Just (fromRef oldtree)) "--staged" procdiff
changed' <- if conflicted
then fst <$> updatetodiff Nothing "--staged"
procmergeconflictdiff
else pure False
flushdb (changed || changed')
updatetodiff old new processor = do
(l, cleanup) <- inRepo $ pipeNullSplit' $ diff old new
processor l False False
`finally` void (liftIO cleanup)
g <- Annex.gitRepo
catstream $ \mdfeeder -> do
conflicted <- updatetodiff g
(Just (fromRef oldtree)) "--staged" (procdiff mdfeeder)
when conflicted $
void $ updatetodiff g Nothing "--staged"
(procmergeconflictdiff mdfeeder)
-- Flush database changes immediately
-- so other processes can see them.
flushdb changed
| changed = liftIO $ H.flushDbQueue qh
| otherwise = noop
updatetodiff g old new processor = do
(l, cleanup) <- pipeNullSplit' (diff old new) g
processor l False
`finally` void cleanup
-- Avoid running smudge clean filter, which would block trying to
-- access the locked database. git write-tree sometimes calls it,
@ -320,22 +319,21 @@ reconcileStaged qh = do
, Param "--no-ext-diff"
]
procdiff (info:file:rest) changed conflicted
procdiff mdfeeder (info:file:rest) conflicted
| ":" `S.isPrefixOf` info = case S8.words info of
(_colonsrcmode:dstmode:srcsha:dstsha:status:[]) -> do
let conflicted' = status == "U"
-- avoid removing associated file when
-- there is a merge conflict
removed <- if not conflicted'
then catKey (Ref srcsha) >>= \case
unless conflicted' $
send mdfeeder (Ref srcsha) $ \case
Just oldkey -> do
liftIO $ SQL.removeAssociatedFile oldkey
(asTopFilePath file)
(SQL.WriteHandle qh)
return True
Nothing -> return False
else return False
added <- catKey (Ref dstsha) >>= \case
send mdfeeder (Ref dstsha) $ \case
Just key -> do
liftIO $ SQL.addAssociatedFile key
(asTopFilePath file)
@ -344,32 +342,30 @@ reconcileStaged qh = do
reconcilerace (asTopFilePath file) key
return True
Nothing -> return False
procdiff rest
(changed || removed || added)
procdiff mdfeeder rest
(conflicted || conflicted')
_ -> return (changed, conflicted) -- parse failed
procdiff _ changed conflicted = return (changed, conflicted)
_ -> return conflicted -- parse failed
procdiff _ _ conflicted = return conflicted
-- Processing a diff --index when there is a merge conflict.
-- This diff will have the new local version of a file as the
-- first sha, and a null sha as the second sha, and we only
-- care about files that are in conflict.
procmergeconflictdiff (info:file:rest) changed conflicted
procmergeconflictdiff mdfeeder (info:file:rest) conflicted
| ":" `S.isPrefixOf` info = case S8.words info of
(_colonmode:_mode:sha:_sha:status:[]) -> do
let conflicted' = status == "U"
added <- catKey (Ref sha) >>= \case
send mdfeeder (Ref sha) $ \case
Just key -> do
liftIO $ SQL.addAssociatedFile key
(asTopFilePath file)
(SQL.WriteHandle qh)
return True
Nothing -> return False
procmergeconflictdiff rest
(changed || added)
let conflicted' = status == "U"
procmergeconflictdiff mdfeeder rest
(conflicted || conflicted')
_ -> return (changed, conflicted) -- parse failed
procmergeconflictdiff _ changed conflicted = return (changed, conflicted)
_ -> return conflicted -- parse failed
procmergeconflictdiff _ _ conflicted = return conflicted
reconcilerace file key = do
caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
@ -385,3 +381,41 @@ reconcileStaged qh = do
SQL.addInodeCaches key [ic] (SQL.WriteHandle qh)
(False, True) -> depopulatePointerFile key p
_ -> return ()
send :: ((Maybe Key -> Annex a, Ref) -> IO ()) -> Ref -> (Maybe Key -> Annex a) -> IO ()
send feeder r withk = feeder (withk, r)
-- Streaming through git cat-file like this is significantly
-- faster than using catKey.
catstream a = do
g <- Annex.gitRepo
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
catObjectStream g $ \catfeeder catcloser catreader -> do
feedt <- liftIO $ async $
a mdfeeder
`finally` void mdcloser
proct <- liftIO $ async $
procthread mdreader catfeeder
`finally` void catcloser
dbchanged <- dbwriter False catreader
-- Flush database changes now
-- so other processes can see them.
when dbchanged $
liftIO $ H.flushDbQueue qh
() <- liftIO $ wait feedt
liftIO $ wait proct
return ()
where
procthread mdreader catfeeder = mdreader >>= \case
Just (ka, Just (sha, size, _type))
| size < maxPointerSz -> do
() <- catfeeder (ka, sha)
procthread mdreader catfeeder
Just _ -> procthread mdreader catfeeder
Nothing -> return ()
dbwriter dbchanged catreader = liftIO catreader >>= \case
Just (ka, content) -> do
changed <- ka (parseLinkTargetOrPointerLazy =<< content)
dbwriter (dbchanged || changed) catreader
Nothing -> return dbchanged

View file

@ -5,3 +5,15 @@ Normally it's plenty fast enough, but users who often switch between
branches that have tens to hundreds of thousands of diverged files will
find it slow, and this should speed it up by somewhere around 3x (excluding
sqlite writes). --[[Joey]]
> Implemented this. Benchmarked it in a situation where 100,000 annexed
> files were added to the index (by checking out a branch with more annexed
> files). old: 50 seconds; new: 41 seconds
> Also benchmarked when 100,000 annexed files were removed from the index.
> old: 26 seconds; new: 17 seconds.
>
> Adding associated files to the sqlite db is clearly more expensive than
> removing from it.
>
> [[done]] --[[Joey]]