avoid double work in git-annex init
reconcileStaged was doing a redundant scan to scannAnnexedFiles. It would probably make sense to move the body of scannAnnexedFiles into reconcileStaged, the separation does not really serve any purpose. Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
6ceb31a30a
commit
0f10f208a7
5 changed files with 93 additions and 38 deletions
|
@ -70,27 +70,29 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
|
||||||
|
|
||||||
{- Find all annexed files and update the keys database for them.
|
{- Find all annexed files and update the keys database for them.
|
||||||
-
|
-
|
||||||
- This is expensive, and so normally the associated files are updated
|
|
||||||
- incrementally when changes are noticed. So, this only needs to be done
|
|
||||||
- when initializing/upgrading a repository.
|
|
||||||
-
|
|
||||||
- Also, the content for an unlocked file may already be present as
|
- Also, the content for an unlocked file may already be present as
|
||||||
- an annex object. If so, populate the pointer file with it.
|
- an annex object. If so, populate the pointer file with it.
|
||||||
- But if worktree file does not have a pointer file's content, it is left
|
- But if worktree file does not have a pointer file's content, it is left
|
||||||
- as-is.
|
- as-is.
|
||||||
|
-
|
||||||
|
- Normally the keys database is updated incrementally when changes are
|
||||||
|
- noticed. For an initial scan, this is faster than that incremental
|
||||||
|
- update.
|
||||||
-}
|
-}
|
||||||
scanAnnexedFiles :: Annex ()
|
scanAnnexedFiles :: Annex ()
|
||||||
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
|
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $
|
||||||
g <- Annex.gitRepo
|
Database.Keys.runWriter' (Just reconciler) (const noop)
|
||||||
Database.Keys.runWriter $
|
|
||||||
liftIO . Database.Keys.SQL.dropAllAssociatedFiles
|
|
||||||
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
|
||||||
Git.LsTree.LsTreeRecursive
|
|
||||||
(Git.LsTree.LsTreeLong True)
|
|
||||||
Git.Ref.headRef
|
|
||||||
catObjectStreamLsTree l want g go
|
|
||||||
liftIO $ void cleanup
|
|
||||||
where
|
where
|
||||||
|
reconciler dbh = do
|
||||||
|
g <- Annex.gitRepo
|
||||||
|
liftIO $ Database.Keys.SQL.dropAllAssociatedFiles dbh
|
||||||
|
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
||||||
|
Git.LsTree.LsTreeRecursive
|
||||||
|
(Git.LsTree.LsTreeLong True)
|
||||||
|
Git.Ref.headRef
|
||||||
|
catObjectStreamLsTree l want g (go dbh)
|
||||||
|
liftIO $ void cleanup
|
||||||
|
|
||||||
-- Want to process symlinks, and regular files.
|
-- Want to process symlinks, and regular files.
|
||||||
want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
|
want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
|
||||||
Just Git.Types.TreeSymlink -> Just (i, False)
|
Just Git.Types.TreeSymlink -> Just (i, False)
|
||||||
|
@ -103,17 +105,16 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ d
|
||||||
Just n | n < maxPointerSz -> Just (i, True)
|
Just n | n < maxPointerSz -> Just (i, True)
|
||||||
_ -> Nothing
|
_ -> Nothing
|
||||||
|
|
||||||
go getnext = liftIO getnext >>= \case
|
go dbh getnext = liftIO getnext >>= \case
|
||||||
Just ((i, isregfile), Just c) -> do
|
Just ((i, isregfile), Just c) -> do
|
||||||
maybe noop (add i isregfile)
|
maybe noop (add i isregfile dbh)
|
||||||
(parseLinkTargetOrPointer (L.toStrict c))
|
(parseLinkTargetOrPointer (L.toStrict c))
|
||||||
go getnext
|
go dbh getnext
|
||||||
_ -> return ()
|
_ -> return ()
|
||||||
|
|
||||||
add i isregfile k = do
|
add i isregfile dbh k = do
|
||||||
let tf = Git.LsTree.file i
|
let tf = Git.LsTree.file i
|
||||||
Database.Keys.runWriter $
|
liftIO $ Database.Keys.SQL.addAssociatedFileFast k tf dbh
|
||||||
liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
|
|
||||||
whenM (pure isregfile <&&> inAnnex k) $ do
|
whenM (pure isregfile <&&> inAnnex k) $ do
|
||||||
f <- fromRepo $ fromTopFilePath tf
|
f <- fromRepo $ fromTopFilePath tf
|
||||||
liftIO (isPointerFile f) >>= \case
|
liftIO (isPointerFile f) >>= \case
|
||||||
|
|
|
@ -23,6 +23,7 @@ module Database.Keys (
|
||||||
removeInodeCache,
|
removeInodeCache,
|
||||||
isInodeKnown,
|
isInodeKnown,
|
||||||
runWriter,
|
runWriter,
|
||||||
|
runWriter',
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import qualified Database.Keys.SQL as SQL
|
import qualified Database.Keys.SQL as SQL
|
||||||
|
@ -73,7 +74,7 @@ runReader a = do
|
||||||
v <- a (SQL.ReadHandle qh)
|
v <- a (SQL.ReadHandle qh)
|
||||||
return (v, st)
|
return (v, st)
|
||||||
go DbClosed = do
|
go DbClosed = do
|
||||||
st' <- openDb True DbClosed
|
st' <- openDb True Nothing DbClosed
|
||||||
v <- case st' of
|
v <- case st' of
|
||||||
(DbOpen qh) -> a (SQL.ReadHandle qh)
|
(DbOpen qh) -> a (SQL.ReadHandle qh)
|
||||||
_ -> return mempty
|
_ -> return mempty
|
||||||
|
@ -87,7 +88,15 @@ runReaderIO a = runReader (liftIO . a)
|
||||||
-
|
-
|
||||||
- The database is created if it doesn't exist yet. -}
|
- The database is created if it doesn't exist yet. -}
|
||||||
runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex ()
|
runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex ()
|
||||||
runWriter a = do
|
runWriter = runWriter' Nothing
|
||||||
|
|
||||||
|
runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
|
||||||
|
runWriterIO a = runWriter (liftIO . a)
|
||||||
|
|
||||||
|
{- When a reconcile action is passed, it is run by reconcileStaged instead
|
||||||
|
- of its usual scan, and must update the database in the same way. -}
|
||||||
|
runWriter' :: Maybe (SQL.WriteHandle -> Annex ()) -> (SQL.WriteHandle -> Annex ()) -> Annex ()
|
||||||
|
runWriter' reconciler a = do
|
||||||
h <- Annex.getRead Annex.keysdbhandle
|
h <- Annex.getRead Annex.keysdbhandle
|
||||||
withDbState h go
|
withDbState h go
|
||||||
where
|
where
|
||||||
|
@ -95,15 +104,12 @@ runWriter a = do
|
||||||
v <- a (SQL.WriteHandle qh)
|
v <- a (SQL.WriteHandle qh)
|
||||||
return (v, st)
|
return (v, st)
|
||||||
go st = do
|
go st = do
|
||||||
st' <- openDb False st
|
st' <- openDb False reconciler st
|
||||||
v <- case st' of
|
v <- case st' of
|
||||||
DbOpen qh -> a (SQL.WriteHandle qh)
|
DbOpen qh -> a (SQL.WriteHandle qh)
|
||||||
_ -> error "internal"
|
_ -> error "internal"
|
||||||
return (v, st')
|
return (v, st')
|
||||||
|
|
||||||
runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
|
|
||||||
runWriterIO a = runWriter (liftIO . a)
|
|
||||||
|
|
||||||
{- Opens the database, creating it if it doesn't exist yet.
|
{- Opens the database, creating it if it doesn't exist yet.
|
||||||
-
|
-
|
||||||
- Multiple readers and writers can have the database open at the same
|
- Multiple readers and writers can have the database open at the same
|
||||||
|
@ -112,10 +118,10 @@ runWriterIO a = runWriter (liftIO . a)
|
||||||
- the database doesn't exist yet, one caller wins the lock and
|
- the database doesn't exist yet, one caller wins the lock and
|
||||||
- can create it undisturbed.
|
- can create it undisturbed.
|
||||||
-}
|
-}
|
||||||
openDb :: Bool -> DbState -> Annex DbState
|
openDb :: Bool -> (Maybe (SQL.WriteHandle -> Annex ())) -> DbState -> Annex DbState
|
||||||
openDb _ st@(DbOpen _) = return st
|
openDb _ _ st@(DbOpen _) = return st
|
||||||
openDb False DbUnavailable = return DbUnavailable
|
openDb False _ DbUnavailable = return DbUnavailable
|
||||||
openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do
|
openDb forwrite reconciler _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do
|
||||||
dbdir <- fromRepo gitAnnexKeysDb
|
dbdir <- fromRepo gitAnnexKeysDb
|
||||||
let db = dbdir P.</> "db"
|
let db = dbdir P.</> "db"
|
||||||
dbexists <- liftIO $ R.doesPathExist db
|
dbexists <- liftIO $ R.doesPathExist db
|
||||||
|
@ -133,7 +139,7 @@ openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKe
|
||||||
|
|
||||||
open db = do
|
open db = do
|
||||||
qh <- liftIO $ H.openDbQueue H.MultiWriter db SQL.containedTable
|
qh <- liftIO $ H.openDbQueue H.MultiWriter db SQL.containedTable
|
||||||
reconcileStaged qh
|
reconcileStaged qh reconciler
|
||||||
return $ DbOpen qh
|
return $ DbOpen qh
|
||||||
|
|
||||||
{- Closes the database if it was open. Any writes will be flushed to it.
|
{- Closes the database if it was open. Any writes will be flushed to it.
|
||||||
|
@ -223,8 +229,8 @@ isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s)
|
||||||
- So when using getAssociatedFiles, have to make sure the file still
|
- So when using getAssociatedFiles, have to make sure the file still
|
||||||
- is an associated file.
|
- is an associated file.
|
||||||
-}
|
-}
|
||||||
reconcileStaged :: H.DbQueue -> Annex ()
|
reconcileStaged :: H.DbQueue -> Maybe (SQL.WriteHandle -> Annex ()) -> Annex ()
|
||||||
reconcileStaged qh = do
|
reconcileStaged qh mreconciler = do
|
||||||
gitindex <- inRepo currentIndexFile
|
gitindex <- inRepo currentIndexFile
|
||||||
indexcache <- fromRawFilePath <$> fromRepo gitAnnexKeysDbIndexCache
|
indexcache <- fromRawFilePath <$> fromRepo gitAnnexKeysDbIndexCache
|
||||||
withTSDelta (liftIO . genInodeCache gitindex) >>= \case
|
withTSDelta (liftIO . genInodeCache gitindex) >>= \case
|
||||||
|
@ -246,12 +252,21 @@ reconcileStaged qh = do
|
||||||
go cur indexcache (Just newtree) = do
|
go cur indexcache (Just newtree) = do
|
||||||
oldtree <- getoldtree
|
oldtree <- getoldtree
|
||||||
when (oldtree /= newtree) $ do
|
when (oldtree /= newtree) $ do
|
||||||
|
case mreconciler of
|
||||||
|
Just reconciler ->
|
||||||
|
reconciler (SQL.WriteHandle qh)
|
||||||
|
Nothing -> noop
|
||||||
g <- Annex.gitRepo
|
g <- Annex.gitRepo
|
||||||
void $ catstream $ \mdfeeder ->
|
void $ catstream $ \mdfeeder -> do
|
||||||
void $ updatetodiff g
|
case mreconciler of
|
||||||
(Just (fromRef oldtree))
|
Nothing -> void $ updatetodiff g
|
||||||
(fromRef newtree)
|
(Just (fromRef oldtree))
|
||||||
(procdiff mdfeeder)
|
(fromRef newtree)
|
||||||
|
(procdiff mdfeeder)
|
||||||
|
Just _ -> void $ updatetodiff g
|
||||||
|
Nothing "--staged"
|
||||||
|
(procdiff mdfeeder)
|
||||||
|
|
||||||
liftIO $ writeFile indexcache $ showInodeCache cur
|
liftIO $ writeFile indexcache $ showInodeCache cur
|
||||||
-- Storing the tree in a ref makes sure it does not
|
-- Storing the tree in a ref makes sure it does not
|
||||||
-- get garbage collected, and is available to diff
|
-- get garbage collected, and is available to diff
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
[[!comment format=mdwn
|
||||||
|
username="joey"
|
||||||
|
subject="""comment 4"""
|
||||||
|
date="2021-06-07T20:22:24Z"
|
||||||
|
content="""
|
||||||
|
Turns out `git-annex init` got a lot slower than it had to, it was doing
|
||||||
|
the same kind of scan twice. I think that probably explains much of the
|
||||||
|
slowdown you saw. (Although it still needs to do somewhat more work than
|
||||||
|
before, but also does it more efficiently than before.)
|
||||||
|
|
||||||
|
Also I've optimised the update scan's use of git, around a 20% speedup,
|
||||||
|
although I don't know if your test case speed was impacted much by that
|
||||||
|
scan anyway.
|
||||||
|
"""]]
|
|
@ -0,0 +1,9 @@
|
||||||
|
[[!comment format=mdwn
|
||||||
|
username="joey"
|
||||||
|
subject="""comment 5"""
|
||||||
|
date="2021-06-07T20:42:34Z"
|
||||||
|
content="""
|
||||||
|
Looking at the CI log, I don't see any past runs that took 1h 46min.
|
||||||
|
A month ago they were taking 2h 6min. Let's see if the changes I'm pushing
|
||||||
|
now drop it back to that.
|
||||||
|
"""]]
|
|
@ -0,0 +1,16 @@
|
||||||
|
[[!comment format=mdwn
|
||||||
|
username="joey"
|
||||||
|
subject="""comment 20"""
|
||||||
|
date="2021-06-07T19:22:03Z"
|
||||||
|
content="""
|
||||||
|
Turns out git-annex init was running both scanAnnexedFiles and
|
||||||
|
reconcileStaged, which after recent changes to the latter, both do
|
||||||
|
close to the same scan when run in a fresh clone. So double work!
|
||||||
|
|
||||||
|
Benchmarking with 100,000 files, git-annex init took 88 seconds.
|
||||||
|
Fixed not to use reconcileStaged it took 37 seconds.
|
||||||
|
|
||||||
|
(Keeping reconcileStaged and removing scanAnnexedFiles it took 47 seconds.
|
||||||
|
That makes sense; reconcileStaged is an incremental updater and is not
|
||||||
|
able to use SQL as efficiently as scanAnnexedFiles.)
|
||||||
|
"""]]
|
Loading…
Add table
Add a link
Reference in a new issue