optimise reconcileStaged with git cat-file streaming
Commit 428c91606b
made it need to do more
work in situations like switching between very different branches.
Compare with seekFilteredKeys which has a similar optimisation. Might be
possible to factor out the common part from these?
Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
70dbe61fc2
commit
6ceb31a30a
2 changed files with 84 additions and 38 deletions
110
Database/Keys.hs
110
Database/Keys.hs
|
@ -34,7 +34,6 @@ import Annex.Locations
|
|||
import Annex.Common hiding (delete)
|
||||
import qualified Annex
|
||||
import Annex.LockFile
|
||||
import Annex.CatFile
|
||||
import Annex.Content.PointerFile
|
||||
import Annex.Link
|
||||
import Utility.InodeCache
|
||||
|
@ -45,6 +44,7 @@ import Git.Command
|
|||
import Git.Types
|
||||
import Git.Index
|
||||
import Git.Sha
|
||||
import Git.CatFile
|
||||
import Git.Branch (writeTreeQuiet, update')
|
||||
import qualified Git.Ref
|
||||
import Config.Smudge
|
||||
|
@ -53,6 +53,7 @@ import qualified Utility.RawFilePath as R
|
|||
import qualified Data.ByteString as S
|
||||
import qualified Data.ByteString.Char8 as S8
|
||||
import qualified System.FilePath.ByteString as P
|
||||
import Control.Concurrent.Async
|
||||
|
||||
{- Runs an action that reads from the database.
|
||||
-
|
||||
|
@ -241,12 +242,16 @@ reconcileStaged qh = do
|
|||
<$> catchMaybeIO (readFile indexcache)
|
||||
|
||||
getoldtree = fromMaybe emptyTree <$> inRepo (Git.Ref.sha lastindexref)
|
||||
|
||||
|
||||
go cur indexcache (Just newtree) = do
|
||||
oldtree <- getoldtree
|
||||
when (oldtree /= newtree) $ do
|
||||
updatetodiff (Just (fromRef oldtree)) (fromRef newtree) procdiff
|
||||
>>= flushdb . fst
|
||||
g <- Annex.gitRepo
|
||||
void $ catstream $ \mdfeeder ->
|
||||
void $ updatetodiff g
|
||||
(Just (fromRef oldtree))
|
||||
(fromRef newtree)
|
||||
(procdiff mdfeeder)
|
||||
liftIO $ writeFile indexcache $ showInodeCache cur
|
||||
-- Storing the tree in a ref makes sure it does not
|
||||
-- get garbage collected, and is available to diff
|
||||
|
@ -264,24 +269,18 @@ reconcileStaged qh = do
|
|||
-- is done, with --staged but no old tree.
|
||||
go _ _ Nothing = do
|
||||
oldtree <- getoldtree
|
||||
(changed, conflicted) <- updatetodiff
|
||||
(Just (fromRef oldtree)) "--staged" procdiff
|
||||
changed' <- if conflicted
|
||||
then fst <$> updatetodiff Nothing "--staged"
|
||||
procmergeconflictdiff
|
||||
else pure False
|
||||
flushdb (changed || changed')
|
||||
|
||||
updatetodiff old new processor = do
|
||||
(l, cleanup) <- inRepo $ pipeNullSplit' $ diff old new
|
||||
processor l False False
|
||||
`finally` void (liftIO cleanup)
|
||||
g <- Annex.gitRepo
|
||||
catstream $ \mdfeeder -> do
|
||||
conflicted <- updatetodiff g
|
||||
(Just (fromRef oldtree)) "--staged" (procdiff mdfeeder)
|
||||
when conflicted $
|
||||
void $ updatetodiff g Nothing "--staged"
|
||||
(procmergeconflictdiff mdfeeder)
|
||||
|
||||
-- Flush database changes immediately
|
||||
-- so other processes can see them.
|
||||
flushdb changed
|
||||
| changed = liftIO $ H.flushDbQueue qh
|
||||
| otherwise = noop
|
||||
updatetodiff g old new processor = do
|
||||
(l, cleanup) <- pipeNullSplit' (diff old new) g
|
||||
processor l False
|
||||
`finally` void cleanup
|
||||
|
||||
-- Avoid running smudge clean filter, which would block trying to
|
||||
-- access the locked database. git write-tree sometimes calls it,
|
||||
|
@ -320,22 +319,21 @@ reconcileStaged qh = do
|
|||
, Param "--no-ext-diff"
|
||||
]
|
||||
|
||||
procdiff (info:file:rest) changed conflicted
|
||||
procdiff mdfeeder (info:file:rest) conflicted
|
||||
| ":" `S.isPrefixOf` info = case S8.words info of
|
||||
(_colonsrcmode:dstmode:srcsha:dstsha:status:[]) -> do
|
||||
let conflicted' = status == "U"
|
||||
-- avoid removing associated file when
|
||||
-- there is a merge conflict
|
||||
removed <- if not conflicted'
|
||||
then catKey (Ref srcsha) >>= \case
|
||||
unless conflicted' $
|
||||
send mdfeeder (Ref srcsha) $ \case
|
||||
Just oldkey -> do
|
||||
liftIO $ SQL.removeAssociatedFile oldkey
|
||||
(asTopFilePath file)
|
||||
(SQL.WriteHandle qh)
|
||||
return True
|
||||
Nothing -> return False
|
||||
else return False
|
||||
added <- catKey (Ref dstsha) >>= \case
|
||||
send mdfeeder (Ref dstsha) $ \case
|
||||
Just key -> do
|
||||
liftIO $ SQL.addAssociatedFile key
|
||||
(asTopFilePath file)
|
||||
|
@ -344,32 +342,30 @@ reconcileStaged qh = do
|
|||
reconcilerace (asTopFilePath file) key
|
||||
return True
|
||||
Nothing -> return False
|
||||
procdiff rest
|
||||
(changed || removed || added)
|
||||
procdiff mdfeeder rest
|
||||
(conflicted || conflicted')
|
||||
_ -> return (changed, conflicted) -- parse failed
|
||||
procdiff _ changed conflicted = return (changed, conflicted)
|
||||
|
||||
_ -> return conflicted -- parse failed
|
||||
procdiff _ _ conflicted = return conflicted
|
||||
|
||||
-- Processing a diff --index when there is a merge conflict.
|
||||
-- This diff will have the new local version of a file as the
|
||||
-- first sha, and a null sha as the second sha, and we only
|
||||
-- care about files that are in conflict.
|
||||
procmergeconflictdiff (info:file:rest) changed conflicted
|
||||
procmergeconflictdiff mdfeeder (info:file:rest) conflicted
|
||||
| ":" `S.isPrefixOf` info = case S8.words info of
|
||||
(_colonmode:_mode:sha:_sha:status:[]) -> do
|
||||
let conflicted' = status == "U"
|
||||
added <- catKey (Ref sha) >>= \case
|
||||
send mdfeeder (Ref sha) $ \case
|
||||
Just key -> do
|
||||
liftIO $ SQL.addAssociatedFile key
|
||||
(asTopFilePath file)
|
||||
(SQL.WriteHandle qh)
|
||||
return True
|
||||
Nothing -> return False
|
||||
procmergeconflictdiff rest
|
||||
(changed || added)
|
||||
let conflicted' = status == "U"
|
||||
procmergeconflictdiff mdfeeder rest
|
||||
(conflicted || conflicted')
|
||||
_ -> return (changed, conflicted) -- parse failed
|
||||
procmergeconflictdiff _ changed conflicted = return (changed, conflicted)
|
||||
_ -> return conflicted -- parse failed
|
||||
procmergeconflictdiff _ _ conflicted = return conflicted
|
||||
|
||||
reconcilerace file key = do
|
||||
caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
|
||||
|
@ -385,3 +381,41 @@ reconcileStaged qh = do
|
|||
SQL.addInodeCaches key [ic] (SQL.WriteHandle qh)
|
||||
(False, True) -> depopulatePointerFile key p
|
||||
_ -> return ()
|
||||
|
||||
send :: ((Maybe Key -> Annex a, Ref) -> IO ()) -> Ref -> (Maybe Key -> Annex a) -> IO ()
|
||||
send feeder r withk = feeder (withk, r)
|
||||
|
||||
-- Streaming through git cat-file like this is significantly
|
||||
-- faster than using catKey.
|
||||
catstream a = do
|
||||
g <- Annex.gitRepo
|
||||
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
|
||||
catObjectStream g $ \catfeeder catcloser catreader -> do
|
||||
feedt <- liftIO $ async $
|
||||
a mdfeeder
|
||||
`finally` void mdcloser
|
||||
proct <- liftIO $ async $
|
||||
procthread mdreader catfeeder
|
||||
`finally` void catcloser
|
||||
dbchanged <- dbwriter False catreader
|
||||
-- Flush database changes now
|
||||
-- so other processes can see them.
|
||||
when dbchanged $
|
||||
liftIO $ H.flushDbQueue qh
|
||||
() <- liftIO $ wait feedt
|
||||
liftIO $ wait proct
|
||||
return ()
|
||||
where
|
||||
procthread mdreader catfeeder = mdreader >>= \case
|
||||
Just (ka, Just (sha, size, _type))
|
||||
| size < maxPointerSz -> do
|
||||
() <- catfeeder (ka, sha)
|
||||
procthread mdreader catfeeder
|
||||
Just _ -> procthread mdreader catfeeder
|
||||
Nothing -> return ()
|
||||
|
||||
dbwriter dbchanged catreader = liftIO catreader >>= \case
|
||||
Just (ka, content) -> do
|
||||
changed <- ka (parseLinkTargetOrPointerLazy =<< content)
|
||||
dbwriter (dbchanged || changed) catreader
|
||||
Nothing -> return dbchanged
|
||||
|
|
|
@ -5,3 +5,15 @@ Normally it's plenty fast enough, but users who often switch between
|
|||
branches that have tens to hundreds of thousands of diverged files will
|
||||
find it slow, and this should speed it up by somewhere around 3x (excluding
|
||||
sqlite writes). --[[Joey]]
|
||||
|
||||
> Implemented this. Benchmarked it in a situation where 100,000 annexed
|
||||
> files were added to the index (by checking out a branch with more annexed
|
||||
> files). old: 50 seconds; new: 41 seconds
|
||||
|
||||
> Also benchmarked when 100,000 annexed files were removed from the index.
|
||||
> old: 26 seconds; new: 17 seconds.
|
||||
>
|
||||
> Adding associated files to the sqlite db is clearly more expensive than
|
||||
> removing from it.
|
||||
>
|
||||
> [[done]] --[[Joey]]
|
||||
|
|
Loading…
Add table
Reference in a new issue