optimise reconcileStaged with git cat-file streaming
Commit 428c91606b
made it need to do more
work in situations like switching between very different branches.
Compare with seekFilteredKeys which has a similar optimisation. Might be
possible to factor out the common part from these?
Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
70dbe61fc2
commit
6ceb31a30a
2 changed files with 84 additions and 38 deletions
110
Database/Keys.hs
110
Database/Keys.hs
|
@ -34,7 +34,6 @@ import Annex.Locations
|
||||||
import Annex.Common hiding (delete)
|
import Annex.Common hiding (delete)
|
||||||
import qualified Annex
|
import qualified Annex
|
||||||
import Annex.LockFile
|
import Annex.LockFile
|
||||||
import Annex.CatFile
|
|
||||||
import Annex.Content.PointerFile
|
import Annex.Content.PointerFile
|
||||||
import Annex.Link
|
import Annex.Link
|
||||||
import Utility.InodeCache
|
import Utility.InodeCache
|
||||||
|
@ -45,6 +44,7 @@ import Git.Command
|
||||||
import Git.Types
|
import Git.Types
|
||||||
import Git.Index
|
import Git.Index
|
||||||
import Git.Sha
|
import Git.Sha
|
||||||
|
import Git.CatFile
|
||||||
import Git.Branch (writeTreeQuiet, update')
|
import Git.Branch (writeTreeQuiet, update')
|
||||||
import qualified Git.Ref
|
import qualified Git.Ref
|
||||||
import Config.Smudge
|
import Config.Smudge
|
||||||
|
@ -53,6 +53,7 @@ import qualified Utility.RawFilePath as R
|
||||||
import qualified Data.ByteString as S
|
import qualified Data.ByteString as S
|
||||||
import qualified Data.ByteString.Char8 as S8
|
import qualified Data.ByteString.Char8 as S8
|
||||||
import qualified System.FilePath.ByteString as P
|
import qualified System.FilePath.ByteString as P
|
||||||
|
import Control.Concurrent.Async
|
||||||
|
|
||||||
{- Runs an action that reads from the database.
|
{- Runs an action that reads from the database.
|
||||||
-
|
-
|
||||||
|
@ -241,12 +242,16 @@ reconcileStaged qh = do
|
||||||
<$> catchMaybeIO (readFile indexcache)
|
<$> catchMaybeIO (readFile indexcache)
|
||||||
|
|
||||||
getoldtree = fromMaybe emptyTree <$> inRepo (Git.Ref.sha lastindexref)
|
getoldtree = fromMaybe emptyTree <$> inRepo (Git.Ref.sha lastindexref)
|
||||||
|
|
||||||
go cur indexcache (Just newtree) = do
|
go cur indexcache (Just newtree) = do
|
||||||
oldtree <- getoldtree
|
oldtree <- getoldtree
|
||||||
when (oldtree /= newtree) $ do
|
when (oldtree /= newtree) $ do
|
||||||
updatetodiff (Just (fromRef oldtree)) (fromRef newtree) procdiff
|
g <- Annex.gitRepo
|
||||||
>>= flushdb . fst
|
void $ catstream $ \mdfeeder ->
|
||||||
|
void $ updatetodiff g
|
||||||
|
(Just (fromRef oldtree))
|
||||||
|
(fromRef newtree)
|
||||||
|
(procdiff mdfeeder)
|
||||||
liftIO $ writeFile indexcache $ showInodeCache cur
|
liftIO $ writeFile indexcache $ showInodeCache cur
|
||||||
-- Storing the tree in a ref makes sure it does not
|
-- Storing the tree in a ref makes sure it does not
|
||||||
-- get garbage collected, and is available to diff
|
-- get garbage collected, and is available to diff
|
||||||
|
@ -264,24 +269,18 @@ reconcileStaged qh = do
|
||||||
-- is done, with --staged but no old tree.
|
-- is done, with --staged but no old tree.
|
||||||
go _ _ Nothing = do
|
go _ _ Nothing = do
|
||||||
oldtree <- getoldtree
|
oldtree <- getoldtree
|
||||||
(changed, conflicted) <- updatetodiff
|
g <- Annex.gitRepo
|
||||||
(Just (fromRef oldtree)) "--staged" procdiff
|
catstream $ \mdfeeder -> do
|
||||||
changed' <- if conflicted
|
conflicted <- updatetodiff g
|
||||||
then fst <$> updatetodiff Nothing "--staged"
|
(Just (fromRef oldtree)) "--staged" (procdiff mdfeeder)
|
||||||
procmergeconflictdiff
|
when conflicted $
|
||||||
else pure False
|
void $ updatetodiff g Nothing "--staged"
|
||||||
flushdb (changed || changed')
|
(procmergeconflictdiff mdfeeder)
|
||||||
|
|
||||||
updatetodiff old new processor = do
|
|
||||||
(l, cleanup) <- inRepo $ pipeNullSplit' $ diff old new
|
|
||||||
processor l False False
|
|
||||||
`finally` void (liftIO cleanup)
|
|
||||||
|
|
||||||
-- Flush database changes immediately
|
updatetodiff g old new processor = do
|
||||||
-- so other processes can see them.
|
(l, cleanup) <- pipeNullSplit' (diff old new) g
|
||||||
flushdb changed
|
processor l False
|
||||||
| changed = liftIO $ H.flushDbQueue qh
|
`finally` void cleanup
|
||||||
| otherwise = noop
|
|
||||||
|
|
||||||
-- Avoid running smudge clean filter, which would block trying to
|
-- Avoid running smudge clean filter, which would block trying to
|
||||||
-- access the locked database. git write-tree sometimes calls it,
|
-- access the locked database. git write-tree sometimes calls it,
|
||||||
|
@ -320,22 +319,21 @@ reconcileStaged qh = do
|
||||||
, Param "--no-ext-diff"
|
, Param "--no-ext-diff"
|
||||||
]
|
]
|
||||||
|
|
||||||
procdiff (info:file:rest) changed conflicted
|
procdiff mdfeeder (info:file:rest) conflicted
|
||||||
| ":" `S.isPrefixOf` info = case S8.words info of
|
| ":" `S.isPrefixOf` info = case S8.words info of
|
||||||
(_colonsrcmode:dstmode:srcsha:dstsha:status:[]) -> do
|
(_colonsrcmode:dstmode:srcsha:dstsha:status:[]) -> do
|
||||||
let conflicted' = status == "U"
|
let conflicted' = status == "U"
|
||||||
-- avoid removing associated file when
|
-- avoid removing associated file when
|
||||||
-- there is a merge conflict
|
-- there is a merge conflict
|
||||||
removed <- if not conflicted'
|
unless conflicted' $
|
||||||
then catKey (Ref srcsha) >>= \case
|
send mdfeeder (Ref srcsha) $ \case
|
||||||
Just oldkey -> do
|
Just oldkey -> do
|
||||||
liftIO $ SQL.removeAssociatedFile oldkey
|
liftIO $ SQL.removeAssociatedFile oldkey
|
||||||
(asTopFilePath file)
|
(asTopFilePath file)
|
||||||
(SQL.WriteHandle qh)
|
(SQL.WriteHandle qh)
|
||||||
return True
|
return True
|
||||||
Nothing -> return False
|
Nothing -> return False
|
||||||
else return False
|
send mdfeeder (Ref dstsha) $ \case
|
||||||
added <- catKey (Ref dstsha) >>= \case
|
|
||||||
Just key -> do
|
Just key -> do
|
||||||
liftIO $ SQL.addAssociatedFile key
|
liftIO $ SQL.addAssociatedFile key
|
||||||
(asTopFilePath file)
|
(asTopFilePath file)
|
||||||
|
@ -344,32 +342,30 @@ reconcileStaged qh = do
|
||||||
reconcilerace (asTopFilePath file) key
|
reconcilerace (asTopFilePath file) key
|
||||||
return True
|
return True
|
||||||
Nothing -> return False
|
Nothing -> return False
|
||||||
procdiff rest
|
procdiff mdfeeder rest
|
||||||
(changed || removed || added)
|
|
||||||
(conflicted || conflicted')
|
(conflicted || conflicted')
|
||||||
_ -> return (changed, conflicted) -- parse failed
|
_ -> return conflicted -- parse failed
|
||||||
procdiff _ changed conflicted = return (changed, conflicted)
|
procdiff _ _ conflicted = return conflicted
|
||||||
|
|
||||||
-- Processing a diff --index when there is a merge conflict.
|
-- Processing a diff --index when there is a merge conflict.
|
||||||
-- This diff will have the new local version of a file as the
|
-- This diff will have the new local version of a file as the
|
||||||
-- first sha, and a null sha as the second sha, and we only
|
-- first sha, and a null sha as the second sha, and we only
|
||||||
-- care about files that are in conflict.
|
-- care about files that are in conflict.
|
||||||
procmergeconflictdiff (info:file:rest) changed conflicted
|
procmergeconflictdiff mdfeeder (info:file:rest) conflicted
|
||||||
| ":" `S.isPrefixOf` info = case S8.words info of
|
| ":" `S.isPrefixOf` info = case S8.words info of
|
||||||
(_colonmode:_mode:sha:_sha:status:[]) -> do
|
(_colonmode:_mode:sha:_sha:status:[]) -> do
|
||||||
let conflicted' = status == "U"
|
send mdfeeder (Ref sha) $ \case
|
||||||
added <- catKey (Ref sha) >>= \case
|
|
||||||
Just key -> do
|
Just key -> do
|
||||||
liftIO $ SQL.addAssociatedFile key
|
liftIO $ SQL.addAssociatedFile key
|
||||||
(asTopFilePath file)
|
(asTopFilePath file)
|
||||||
(SQL.WriteHandle qh)
|
(SQL.WriteHandle qh)
|
||||||
return True
|
return True
|
||||||
Nothing -> return False
|
Nothing -> return False
|
||||||
procmergeconflictdiff rest
|
let conflicted' = status == "U"
|
||||||
(changed || added)
|
procmergeconflictdiff mdfeeder rest
|
||||||
(conflicted || conflicted')
|
(conflicted || conflicted')
|
||||||
_ -> return (changed, conflicted) -- parse failed
|
_ -> return conflicted -- parse failed
|
||||||
procmergeconflictdiff _ changed conflicted = return (changed, conflicted)
|
procmergeconflictdiff _ _ conflicted = return conflicted
|
||||||
|
|
||||||
reconcilerace file key = do
|
reconcilerace file key = do
|
||||||
caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
|
caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
|
||||||
|
@ -385,3 +381,41 @@ reconcileStaged qh = do
|
||||||
SQL.addInodeCaches key [ic] (SQL.WriteHandle qh)
|
SQL.addInodeCaches key [ic] (SQL.WriteHandle qh)
|
||||||
(False, True) -> depopulatePointerFile key p
|
(False, True) -> depopulatePointerFile key p
|
||||||
_ -> return ()
|
_ -> return ()
|
||||||
|
|
||||||
|
send :: ((Maybe Key -> Annex a, Ref) -> IO ()) -> Ref -> (Maybe Key -> Annex a) -> IO ()
|
||||||
|
send feeder r withk = feeder (withk, r)
|
||||||
|
|
||||||
|
-- Streaming through git cat-file like this is significantly
|
||||||
|
-- faster than using catKey.
|
||||||
|
catstream a = do
|
||||||
|
g <- Annex.gitRepo
|
||||||
|
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
|
||||||
|
catObjectStream g $ \catfeeder catcloser catreader -> do
|
||||||
|
feedt <- liftIO $ async $
|
||||||
|
a mdfeeder
|
||||||
|
`finally` void mdcloser
|
||||||
|
proct <- liftIO $ async $
|
||||||
|
procthread mdreader catfeeder
|
||||||
|
`finally` void catcloser
|
||||||
|
dbchanged <- dbwriter False catreader
|
||||||
|
-- Flush database changes now
|
||||||
|
-- so other processes can see them.
|
||||||
|
when dbchanged $
|
||||||
|
liftIO $ H.flushDbQueue qh
|
||||||
|
() <- liftIO $ wait feedt
|
||||||
|
liftIO $ wait proct
|
||||||
|
return ()
|
||||||
|
where
|
||||||
|
procthread mdreader catfeeder = mdreader >>= \case
|
||||||
|
Just (ka, Just (sha, size, _type))
|
||||||
|
| size < maxPointerSz -> do
|
||||||
|
() <- catfeeder (ka, sha)
|
||||||
|
procthread mdreader catfeeder
|
||||||
|
Just _ -> procthread mdreader catfeeder
|
||||||
|
Nothing -> return ()
|
||||||
|
|
||||||
|
dbwriter dbchanged catreader = liftIO catreader >>= \case
|
||||||
|
Just (ka, content) -> do
|
||||||
|
changed <- ka (parseLinkTargetOrPointerLazy =<< content)
|
||||||
|
dbwriter (dbchanged || changed) catreader
|
||||||
|
Nothing -> return dbchanged
|
||||||
|
|
|
@ -5,3 +5,15 @@ Normally it's plenty fast enough, but users who often switch between
|
||||||
branches that have tens to hundreds of thousands of diverged files will
|
branches that have tens to hundreds of thousands of diverged files will
|
||||||
find it slow, and this should speed it up by somewhere around 3x (excluding
|
find it slow, and this should speed it up by somewhere around 3x (excluding
|
||||||
sqlite writes). --[[Joey]]
|
sqlite writes). --[[Joey]]
|
||||||
|
|
||||||
|
> Implemented this. Benchmarked it in a situation where 100,000 annexed
|
||||||
|
> files were added to the index (by checking out a branch with more annexed
|
||||||
|
> files). old: 50 seconds; new: 41 seconds
|
||||||
|
|
||||||
|
> Also benchmarked when 100,000 annexed files were removed from the index.
|
||||||
|
> old: 26 seconds; new: 17 seconds.
|
||||||
|
>
|
||||||
|
> Adding associated files to the sqlite db is clearly more expensive than
|
||||||
|
> removing from it.
|
||||||
|
>
|
||||||
|
> [[done]] --[[Joey]]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue