avoid double work in git-annex init

reconcileStaged was doing a redundant scan to scannAnnexedFiles.

It would probably make sense to move the body of scannAnnexedFiles
into reconcileStaged, the separation does not really serve any purpose.

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2021-06-07 16:50:14 -04:00
parent 6ceb31a30a
commit 0f10f208a7
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 93 additions and 38 deletions

View file

@ -69,28 +69,30 @@ ifAnnexed :: RawFilePath -> (Key -> Annex a) -> Annex a -> Annex a
ifAnnexed file yes no = maybe no yes =<< lookupKey file ifAnnexed file yes no = maybe no yes =<< lookupKey file
{- Find all annexed files and update the keys database for them. {- Find all annexed files and update the keys database for them.
-
- This is expensive, and so normally the associated files are updated
- incrementally when changes are noticed. So, this only needs to be done
- when initializing/upgrading a repository.
- -
- Also, the content for an unlocked file may already be present as - Also, the content for an unlocked file may already be present as
- an annex object. If so, populate the pointer file with it. - an annex object. If so, populate the pointer file with it.
- But if worktree file does not have a pointer file's content, it is left - But if worktree file does not have a pointer file's content, it is left
- as-is. - as-is.
-
- Normally the keys database is updated incrementally when changes are
- noticed. For an initial scan, this is faster than that incremental
- update.
-} -}
scanAnnexedFiles :: Annex () scanAnnexedFiles :: Annex ()
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $
g <- Annex.gitRepo Database.Keys.runWriter' (Just reconciler) (const noop)
Database.Keys.runWriter $
liftIO . Database.Keys.SQL.dropAllAssociatedFiles
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
Git.LsTree.LsTreeRecursive
(Git.LsTree.LsTreeLong True)
Git.Ref.headRef
catObjectStreamLsTree l want g go
liftIO $ void cleanup
where where
reconciler dbh = do
g <- Annex.gitRepo
liftIO $ Database.Keys.SQL.dropAllAssociatedFiles dbh
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
Git.LsTree.LsTreeRecursive
(Git.LsTree.LsTreeLong True)
Git.Ref.headRef
catObjectStreamLsTree l want g (go dbh)
liftIO $ void cleanup
-- Want to process symlinks, and regular files. -- Want to process symlinks, and regular files.
want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
Just Git.Types.TreeSymlink -> Just (i, False) Just Git.Types.TreeSymlink -> Just (i, False)
@ -103,17 +105,16 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ d
Just n | n < maxPointerSz -> Just (i, True) Just n | n < maxPointerSz -> Just (i, True)
_ -> Nothing _ -> Nothing
go getnext = liftIO getnext >>= \case go dbh getnext = liftIO getnext >>= \case
Just ((i, isregfile), Just c) -> do Just ((i, isregfile), Just c) -> do
maybe noop (add i isregfile) maybe noop (add i isregfile dbh)
(parseLinkTargetOrPointer (L.toStrict c)) (parseLinkTargetOrPointer (L.toStrict c))
go getnext go dbh getnext
_ -> return () _ -> return ()
add i isregfile k = do add i isregfile dbh k = do
let tf = Git.LsTree.file i let tf = Git.LsTree.file i
Database.Keys.runWriter $ liftIO $ Database.Keys.SQL.addAssociatedFileFast k tf dbh
liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
whenM (pure isregfile <&&> inAnnex k) $ do whenM (pure isregfile <&&> inAnnex k) $ do
f <- fromRepo $ fromTopFilePath tf f <- fromRepo $ fromTopFilePath tf
liftIO (isPointerFile f) >>= \case liftIO (isPointerFile f) >>= \case

View file

@ -23,6 +23,7 @@ module Database.Keys (
removeInodeCache, removeInodeCache,
isInodeKnown, isInodeKnown,
runWriter, runWriter,
runWriter',
) where ) where
import qualified Database.Keys.SQL as SQL import qualified Database.Keys.SQL as SQL
@ -73,7 +74,7 @@ runReader a = do
v <- a (SQL.ReadHandle qh) v <- a (SQL.ReadHandle qh)
return (v, st) return (v, st)
go DbClosed = do go DbClosed = do
st' <- openDb True DbClosed st' <- openDb True Nothing DbClosed
v <- case st' of v <- case st' of
(DbOpen qh) -> a (SQL.ReadHandle qh) (DbOpen qh) -> a (SQL.ReadHandle qh)
_ -> return mempty _ -> return mempty
@ -87,7 +88,15 @@ runReaderIO a = runReader (liftIO . a)
- -
- The database is created if it doesn't exist yet. -} - The database is created if it doesn't exist yet. -}
runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex () runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex ()
runWriter a = do runWriter = runWriter' Nothing
runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
runWriterIO a = runWriter (liftIO . a)
{- When a reconcile action is passed, it is run by reconcileStaged instead
- of its usual scan, and must update the database in the same way. -}
runWriter' :: Maybe (SQL.WriteHandle -> Annex ()) -> (SQL.WriteHandle -> Annex ()) -> Annex ()
runWriter' reconciler a = do
h <- Annex.getRead Annex.keysdbhandle h <- Annex.getRead Annex.keysdbhandle
withDbState h go withDbState h go
where where
@ -95,15 +104,12 @@ runWriter a = do
v <- a (SQL.WriteHandle qh) v <- a (SQL.WriteHandle qh)
return (v, st) return (v, st)
go st = do go st = do
st' <- openDb False st st' <- openDb False reconciler st
v <- case st' of v <- case st' of
DbOpen qh -> a (SQL.WriteHandle qh) DbOpen qh -> a (SQL.WriteHandle qh)
_ -> error "internal" _ -> error "internal"
return (v, st') return (v, st')
runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
runWriterIO a = runWriter (liftIO . a)
{- Opens the database, creating it if it doesn't exist yet. {- Opens the database, creating it if it doesn't exist yet.
- -
- Multiple readers and writers can have the database open at the same - Multiple readers and writers can have the database open at the same
@ -112,10 +118,10 @@ runWriterIO a = runWriter (liftIO . a)
- the database doesn't exist yet, one caller wins the lock and - the database doesn't exist yet, one caller wins the lock and
- can create it undisturbed. - can create it undisturbed.
-} -}
openDb :: Bool -> DbState -> Annex DbState openDb :: Bool -> (Maybe (SQL.WriteHandle -> Annex ())) -> DbState -> Annex DbState
openDb _ st@(DbOpen _) = return st openDb _ _ st@(DbOpen _) = return st
openDb False DbUnavailable = return DbUnavailable openDb False _ DbUnavailable = return DbUnavailable
openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do openDb forwrite reconciler _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do
dbdir <- fromRepo gitAnnexKeysDb dbdir <- fromRepo gitAnnexKeysDb
let db = dbdir P.</> "db" let db = dbdir P.</> "db"
dbexists <- liftIO $ R.doesPathExist db dbexists <- liftIO $ R.doesPathExist db
@ -133,7 +139,7 @@ openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKe
open db = do open db = do
qh <- liftIO $ H.openDbQueue H.MultiWriter db SQL.containedTable qh <- liftIO $ H.openDbQueue H.MultiWriter db SQL.containedTable
reconcileStaged qh reconcileStaged qh reconciler
return $ DbOpen qh return $ DbOpen qh
{- Closes the database if it was open. Any writes will be flushed to it. {- Closes the database if it was open. Any writes will be flushed to it.
@ -223,8 +229,8 @@ isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s)
- So when using getAssociatedFiles, have to make sure the file still - So when using getAssociatedFiles, have to make sure the file still
- is an associated file. - is an associated file.
-} -}
reconcileStaged :: H.DbQueue -> Annex () reconcileStaged :: H.DbQueue -> Maybe (SQL.WriteHandle -> Annex ()) -> Annex ()
reconcileStaged qh = do reconcileStaged qh mreconciler = do
gitindex <- inRepo currentIndexFile gitindex <- inRepo currentIndexFile
indexcache <- fromRawFilePath <$> fromRepo gitAnnexKeysDbIndexCache indexcache <- fromRawFilePath <$> fromRepo gitAnnexKeysDbIndexCache
withTSDelta (liftIO . genInodeCache gitindex) >>= \case withTSDelta (liftIO . genInodeCache gitindex) >>= \case
@ -246,12 +252,21 @@ reconcileStaged qh = do
go cur indexcache (Just newtree) = do go cur indexcache (Just newtree) = do
oldtree <- getoldtree oldtree <- getoldtree
when (oldtree /= newtree) $ do when (oldtree /= newtree) $ do
case mreconciler of
Just reconciler ->
reconciler (SQL.WriteHandle qh)
Nothing -> noop
g <- Annex.gitRepo g <- Annex.gitRepo
void $ catstream $ \mdfeeder -> void $ catstream $ \mdfeeder -> do
void $ updatetodiff g case mreconciler of
(Just (fromRef oldtree)) Nothing -> void $ updatetodiff g
(fromRef newtree) (Just (fromRef oldtree))
(procdiff mdfeeder) (fromRef newtree)
(procdiff mdfeeder)
Just _ -> void $ updatetodiff g
Nothing "--staged"
(procdiff mdfeeder)
liftIO $ writeFile indexcache $ showInodeCache cur liftIO $ writeFile indexcache $ showInodeCache cur
-- Storing the tree in a ref makes sure it does not -- Storing the tree in a ref makes sure it does not
-- get garbage collected, and is available to diff -- get garbage collected, and is available to diff

View file

@ -0,0 +1,14 @@
[[!comment format=mdwn
username="joey"
subject="""comment 4"""
date="2021-06-07T20:22:24Z"
content="""
Turns out `git-annex init` got a lot slower than it had to, it was doing
the same kind of scan twice. I think that probably explains much of the
slowdown you saw. (Although it still needs to do somewhat more work than
before, but also does it more efficiently than before.)
Also I've optimised the update scan's use of git, around a 20% speedup,
although I don't know if your test case speed was impacted much by that
scan anyway.
"""]]

View file

@ -0,0 +1,9 @@
[[!comment format=mdwn
username="joey"
subject="""comment 5"""
date="2021-06-07T20:42:34Z"
content="""
Looking at the CI log, I don't see any past runs that took 1h 46min.
A month ago they were taking 2h 6min. Let's see if the changes I'm pushing
now drop it back to that.
"""]]

View file

@ -0,0 +1,16 @@
[[!comment format=mdwn
username="joey"
subject="""comment 20"""
date="2021-06-07T19:22:03Z"
content="""
Turns out git-annex init was running both scanAnnexedFiles and
reconcileStaged, which after recent changes to the latter, both do
close to the same scan when run in a fresh clone. So double work!
Benchmarking with 100,000 files, git-annex init took 88 seconds.
Fixed not to use reconcileStaged it took 37 seconds.
(Keeping reconcileStaged and removing scanAnnexedFiles it took 47 seconds.
That makes sense; reconcileStaged is an incremental updater and is not
able to use SQL as efficiently as scanAnnexedFiles.)
"""]]