avoid double work in git-annex init
reconcileStaged was doing a redundant scan to scannAnnexedFiles. It would probably make sense to move the body of scannAnnexedFiles into reconcileStaged, the separation does not really serve any purpose. Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
6ceb31a30a
commit
0f10f208a7
5 changed files with 93 additions and 38 deletions
Annex
Database
doc
bugs/significant_performance_regression_impacting_datal
comment_4_85d1031d2b51c0fc1271c283d8ee7888._commentcomment_5_b0f1e94e96cf8fe992425cf81f8d1105._comment
todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__
|
@ -70,27 +70,29 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
|
|||
|
||||
{- Find all annexed files and update the keys database for them.
|
||||
-
|
||||
- This is expensive, and so normally the associated files are updated
|
||||
- incrementally when changes are noticed. So, this only needs to be done
|
||||
- when initializing/upgrading a repository.
|
||||
-
|
||||
- Also, the content for an unlocked file may already be present as
|
||||
- an annex object. If so, populate the pointer file with it.
|
||||
- But if worktree file does not have a pointer file's content, it is left
|
||||
- as-is.
|
||||
-
|
||||
- Normally the keys database is updated incrementally when changes are
|
||||
- noticed. For an initial scan, this is faster than that incremental
|
||||
- update.
|
||||
-}
|
||||
scanAnnexedFiles :: Annex ()
|
||||
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
|
||||
g <- Annex.gitRepo
|
||||
Database.Keys.runWriter $
|
||||
liftIO . Database.Keys.SQL.dropAllAssociatedFiles
|
||||
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
||||
Git.LsTree.LsTreeRecursive
|
||||
(Git.LsTree.LsTreeLong True)
|
||||
Git.Ref.headRef
|
||||
catObjectStreamLsTree l want g go
|
||||
liftIO $ void cleanup
|
||||
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $
|
||||
Database.Keys.runWriter' (Just reconciler) (const noop)
|
||||
where
|
||||
reconciler dbh = do
|
||||
g <- Annex.gitRepo
|
||||
liftIO $ Database.Keys.SQL.dropAllAssociatedFiles dbh
|
||||
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
||||
Git.LsTree.LsTreeRecursive
|
||||
(Git.LsTree.LsTreeLong True)
|
||||
Git.Ref.headRef
|
||||
catObjectStreamLsTree l want g (go dbh)
|
||||
liftIO $ void cleanup
|
||||
|
||||
-- Want to process symlinks, and regular files.
|
||||
want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
|
||||
Just Git.Types.TreeSymlink -> Just (i, False)
|
||||
|
@ -103,17 +105,16 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ d
|
|||
Just n | n < maxPointerSz -> Just (i, True)
|
||||
_ -> Nothing
|
||||
|
||||
go getnext = liftIO getnext >>= \case
|
||||
go dbh getnext = liftIO getnext >>= \case
|
||||
Just ((i, isregfile), Just c) -> do
|
||||
maybe noop (add i isregfile)
|
||||
maybe noop (add i isregfile dbh)
|
||||
(parseLinkTargetOrPointer (L.toStrict c))
|
||||
go getnext
|
||||
go dbh getnext
|
||||
_ -> return ()
|
||||
|
||||
add i isregfile k = do
|
||||
add i isregfile dbh k = do
|
||||
let tf = Git.LsTree.file i
|
||||
Database.Keys.runWriter $
|
||||
liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
|
||||
liftIO $ Database.Keys.SQL.addAssociatedFileFast k tf dbh
|
||||
whenM (pure isregfile <&&> inAnnex k) $ do
|
||||
f <- fromRepo $ fromTopFilePath tf
|
||||
liftIO (isPointerFile f) >>= \case
|
||||
|
|
|
@ -23,6 +23,7 @@ module Database.Keys (
|
|||
removeInodeCache,
|
||||
isInodeKnown,
|
||||
runWriter,
|
||||
runWriter',
|
||||
) where
|
||||
|
||||
import qualified Database.Keys.SQL as SQL
|
||||
|
@ -73,7 +74,7 @@ runReader a = do
|
|||
v <- a (SQL.ReadHandle qh)
|
||||
return (v, st)
|
||||
go DbClosed = do
|
||||
st' <- openDb True DbClosed
|
||||
st' <- openDb True Nothing DbClosed
|
||||
v <- case st' of
|
||||
(DbOpen qh) -> a (SQL.ReadHandle qh)
|
||||
_ -> return mempty
|
||||
|
@ -87,7 +88,15 @@ runReaderIO a = runReader (liftIO . a)
|
|||
-
|
||||
- The database is created if it doesn't exist yet. -}
|
||||
runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex ()
|
||||
runWriter a = do
|
||||
runWriter = runWriter' Nothing
|
||||
|
||||
runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
|
||||
runWriterIO a = runWriter (liftIO . a)
|
||||
|
||||
{- When a reconcile action is passed, it is run by reconcileStaged instead
|
||||
- of its usual scan, and must update the database in the same way. -}
|
||||
runWriter' :: Maybe (SQL.WriteHandle -> Annex ()) -> (SQL.WriteHandle -> Annex ()) -> Annex ()
|
||||
runWriter' reconciler a = do
|
||||
h <- Annex.getRead Annex.keysdbhandle
|
||||
withDbState h go
|
||||
where
|
||||
|
@ -95,15 +104,12 @@ runWriter a = do
|
|||
v <- a (SQL.WriteHandle qh)
|
||||
return (v, st)
|
||||
go st = do
|
||||
st' <- openDb False st
|
||||
st' <- openDb False reconciler st
|
||||
v <- case st' of
|
||||
DbOpen qh -> a (SQL.WriteHandle qh)
|
||||
_ -> error "internal"
|
||||
return (v, st')
|
||||
|
||||
runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
|
||||
runWriterIO a = runWriter (liftIO . a)
|
||||
|
||||
{- Opens the database, creating it if it doesn't exist yet.
|
||||
-
|
||||
- Multiple readers and writers can have the database open at the same
|
||||
|
@ -112,10 +118,10 @@ runWriterIO a = runWriter (liftIO . a)
|
|||
- the database doesn't exist yet, one caller wins the lock and
|
||||
- can create it undisturbed.
|
||||
-}
|
||||
openDb :: Bool -> DbState -> Annex DbState
|
||||
openDb _ st@(DbOpen _) = return st
|
||||
openDb False DbUnavailable = return DbUnavailable
|
||||
openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do
|
||||
openDb :: Bool -> (Maybe (SQL.WriteHandle -> Annex ())) -> DbState -> Annex DbState
|
||||
openDb _ _ st@(DbOpen _) = return st
|
||||
openDb False _ DbUnavailable = return DbUnavailable
|
||||
openDb forwrite reconciler _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do
|
||||
dbdir <- fromRepo gitAnnexKeysDb
|
||||
let db = dbdir P.</> "db"
|
||||
dbexists <- liftIO $ R.doesPathExist db
|
||||
|
@ -133,7 +139,7 @@ openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKe
|
|||
|
||||
open db = do
|
||||
qh <- liftIO $ H.openDbQueue H.MultiWriter db SQL.containedTable
|
||||
reconcileStaged qh
|
||||
reconcileStaged qh reconciler
|
||||
return $ DbOpen qh
|
||||
|
||||
{- Closes the database if it was open. Any writes will be flushed to it.
|
||||
|
@ -223,8 +229,8 @@ isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s)
|
|||
- So when using getAssociatedFiles, have to make sure the file still
|
||||
- is an associated file.
|
||||
-}
|
||||
reconcileStaged :: H.DbQueue -> Annex ()
|
||||
reconcileStaged qh = do
|
||||
reconcileStaged :: H.DbQueue -> Maybe (SQL.WriteHandle -> Annex ()) -> Annex ()
|
||||
reconcileStaged qh mreconciler = do
|
||||
gitindex <- inRepo currentIndexFile
|
||||
indexcache <- fromRawFilePath <$> fromRepo gitAnnexKeysDbIndexCache
|
||||
withTSDelta (liftIO . genInodeCache gitindex) >>= \case
|
||||
|
@ -246,12 +252,21 @@ reconcileStaged qh = do
|
|||
go cur indexcache (Just newtree) = do
|
||||
oldtree <- getoldtree
|
||||
when (oldtree /= newtree) $ do
|
||||
case mreconciler of
|
||||
Just reconciler ->
|
||||
reconciler (SQL.WriteHandle qh)
|
||||
Nothing -> noop
|
||||
g <- Annex.gitRepo
|
||||
void $ catstream $ \mdfeeder ->
|
||||
void $ updatetodiff g
|
||||
(Just (fromRef oldtree))
|
||||
(fromRef newtree)
|
||||
(procdiff mdfeeder)
|
||||
void $ catstream $ \mdfeeder -> do
|
||||
case mreconciler of
|
||||
Nothing -> void $ updatetodiff g
|
||||
(Just (fromRef oldtree))
|
||||
(fromRef newtree)
|
||||
(procdiff mdfeeder)
|
||||
Just _ -> void $ updatetodiff g
|
||||
Nothing "--staged"
|
||||
(procdiff mdfeeder)
|
||||
|
||||
liftIO $ writeFile indexcache $ showInodeCache cur
|
||||
-- Storing the tree in a ref makes sure it does not
|
||||
-- get garbage collected, and is available to diff
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 4"""
|
||||
date="2021-06-07T20:22:24Z"
|
||||
content="""
|
||||
Turns out `git-annex init` got a lot slower than it had to, it was doing
|
||||
the same kind of scan twice. I think that probably explains much of the
|
||||
slowdown you saw. (Although it still needs to do somewhat more work than
|
||||
before, but also does it more efficiently than before.)
|
||||
|
||||
Also I've optimised the update scan's use of git, around a 20% speedup,
|
||||
although I don't know if your test case speed was impacted much by that
|
||||
scan anyway.
|
||||
"""]]
|
|
@ -0,0 +1,9 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 5"""
|
||||
date="2021-06-07T20:42:34Z"
|
||||
content="""
|
||||
Looking at the CI log, I don't see any past runs that took 1h 46min.
|
||||
A month ago they were taking 2h 6min. Let's see if the changes I'm pushing
|
||||
now drop it back to that.
|
||||
"""]]
|
|
@ -0,0 +1,16 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 20"""
|
||||
date="2021-06-07T19:22:03Z"
|
||||
content="""
|
||||
Turns out git-annex init was running both scanAnnexedFiles and
|
||||
reconcileStaged, which after recent changes to the latter, both do
|
||||
close to the same scan when run in a fresh clone. So double work!
|
||||
|
||||
Benchmarking with 100,000 files, git-annex init took 88 seconds.
|
||||
Fixed not to use reconcileStaged it took 37 seconds.
|
||||
|
||||
(Keeping reconcileStaged and removing scanAnnexedFiles it took 47 seconds.
|
||||
That makes sense; reconcileStaged is an incremental updater and is not
|
||||
able to use SQL as efficiently as scanAnnexedFiles.)
|
||||
"""]]
|
Loading…
Add table
Reference in a new issue