Sped up the initial scanning for annexed files by 15%

Avoids database querying overhead when the database is newly created.

In the large repository where git-annex init took 24 seconds, this sped it
up to 20.47 seconds, a speedup of around 15%.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2022-11-18 13:16:57 -04:00
parent a3e9a0ae27
commit 8fcee4ac9d
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 32 additions and 8 deletions

View file

@@ -1,6 +1,7 @@
 git-annex (10.20221105) UNRELEASED; urgency=medium
   * Support quettabyte and yottabyte.
+  * Sped up the initial scanning for annexed files by 15%.
  -- Joey Hess <id@joeyh.name>  Fri, 18 Nov 2022 12:58:06 -0400

View file

@@ -132,10 +132,10 @@ openDb forwrite _ = do
 	let db = dbdir P.</> "db"
 	dbexists <- liftIO $ R.doesPathExist db
 	case dbexists of
-		True -> open db
+		True -> open db False
 		False -> do
 			initDb db SQL.createTables
-			open db
+			open db True
   where
 	-- If permissions don't allow opening the database, and it's being
 	-- opened for read, treat it as if it does not exist.
@@ -143,9 +143,9 @@ openDb forwrite _ = do
 		| forwrite = throwM e
 		| otherwise = return DbUnavailable
-	open db = do
+	open db dbisnew = do
 		qh <- liftIO $ H.openDbQueue db SQL.containedTable
-		tc <- reconcileStaged qh
+		tc <- reconcileStaged dbisnew qh
 		return $ DbOpen (qh, tc)
{- Closes the database if it was open. Any writes will be flushed to it. {- Closes the database if it was open. Any writes will be flushed to it.
@@ -260,8 +260,8 @@ isInodeKnown i s = or <$> runReaderIO ContentTable
 - So when using getAssociatedFiles, have to make sure the file still
 - is an associated file.
 -}
-reconcileStaged :: H.DbQueue -> Annex DbTablesChanged
-reconcileStaged qh = ifM (Git.Config.isBare <$> gitRepo)
+reconcileStaged :: Bool -> H.DbQueue -> Annex DbTablesChanged
+reconcileStaged dbisnew qh = ifM (Git.Config.isBare <$> gitRepo)
 	( return mempty
 	, do
 		gitindex <- inRepo currentIndexFile
@@ -384,7 +384,7 @@ reconcileStaged qh = ifM (Git.Config.isBare <$> gitRepo)
 			Nothing -> return False
 		send mdfeeder (Ref dstsha) $ \case
 			Just key -> do
-				liftIO $ SQL.addAssociatedFile key
+				liftIO $ addassociatedfile key
 					(asTopFilePath file)
 					(SQL.WriteHandle qh)
 				when (dstmode /= fmtTreeItemType TreeSymlink) $
@@ -497,6 +497,18 @@ reconcileStaged qh = ifM (Git.Config.isBare <$> gitRepo)
 	largediff :: Int
 	largediff = 1000
+	-- When the database is known to have been newly created and empty
+	-- before reconcileStaged started, it is more efficient to use
+	-- newAssociatedFile. It's safe to use it here because this is run
+	-- with a lock held that blocks any other process that opens the
+	-- database, and when the database is newly created, there is no
+	-- existing process that has it open already. And it's not possible
+	-- for reconcileStaged to call this twice on the same filename with
+	-- two different keys.
+	addassociatedfile
+		| dbisnew = SQL.newAssociatedFile
+		| otherwise = SQL.addAssociatedFile
 {- Normally the keys database is updated incrementally when opened,
 - by reconcileStaged. Calling this explicitly allows running the
 - update at an earlier point.

View file

@@ -1,6 +1,6 @@
 {- Sqlite database of information about Keys
 -
- - Copyright 2015-2021 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2022 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
@@ -88,6 +88,15 @@ addAssociatedFile k f = queueDb $
   where
 	af = SFilePath (getTopFilePath f)
+-- Faster than addAssociatedFile, but only safe to use when the file
+-- was not associated with a different key before, as it does not delete
+-- any old key.
+newAssociatedFile :: Key -> TopFilePath -> WriteHandle -> IO ()
+newAssociatedFile k f = queueDb $
+	void $ insert $ Associated k af
+  where
+	af = SFilePath (getTopFilePath f)
 {- Note that the files returned were once associated with the key, but
 - some of them may not be any longer. -}
 getAssociatedFiles :: Key -> ReadHandle -> IO [TopFilePath]

View file

@@ -12,4 +12,6 @@ This will need some care to be implemented safely...
 I benchmarked it, and using insertUnique is no faster, but using insert is.
 This would be a 15% speed up.
+
+Update: Implemented this optimisation.
 """]]