Sped up the initial scanning for annexed files by 15%

Avoids database querying overhead when the database is newly created.

In the large repository where git-annex init took 24 seconds, this sped it
up to 20.47 seconds, a speedup of around 15%.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2022-11-18 13:16:57 -04:00
parent a3e9a0ae27
commit 8fcee4ac9d
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 32 additions and 8 deletions

View file

@@ -1,6 +1,7 @@
 git-annex (10.20221105) UNRELEASED; urgency=medium
   * Support quettabyte and yottabyte.
+  * Sped up the initial scanning for annexed files by 15%.
  -- Joey Hess <id@joeyh.name>  Fri, 18 Nov 2022 12:58:06 -0400

View file

@@ -132,10 +132,10 @@ openDb forwrite _ = do
 	let db = dbdir P.</> "db"
 	dbexists <- liftIO $ R.doesPathExist db
 	case dbexists of
-		True -> open db
+		True -> open db False
 		False -> do
 			initDb db SQL.createTables
-			open db
+			open db True
   where
 	-- If permissions don't allow opening the database, and it's being
 	-- opened for read, treat it as if it does not exist.
@@ -143,9 +143,9 @@ openDb forwrite _ = do
 		| forwrite = throwM e
 		| otherwise = return DbUnavailable
-	open db = do
+	open db dbisnew = do
 		qh <- liftIO $ H.openDbQueue db SQL.containedTable
-		tc <- reconcileStaged qh
+		tc <- reconcileStaged dbisnew qh
 		return $ DbOpen (qh, tc)
{- Closes the database if it was open. Any writes will be flushed to it. {- Closes the database if it was open. Any writes will be flushed to it.
@@ -260,8 +260,8 @@ isInodeKnown i s = or <$> runReaderIO ContentTable
 - So when using getAssociatedFiles, have to make sure the file still
 - is an associated file.
 -}
-reconcileStaged :: H.DbQueue -> Annex DbTablesChanged
-reconcileStaged qh = ifM (Git.Config.isBare <$> gitRepo)
+reconcileStaged :: Bool -> H.DbQueue -> Annex DbTablesChanged
+reconcileStaged dbisnew qh = ifM (Git.Config.isBare <$> gitRepo)
 	( return mempty
 	, do
 		gitindex <- inRepo currentIndexFile
@@ -384,7 +384,7 @@ reconcileStaged qh = ifM (Git.Config.isBare <$> gitRepo)
 			Nothing -> return False
 		send mdfeeder (Ref dstsha) $ \case
 			Just key -> do
-				liftIO $ SQL.addAssociatedFile key
+				liftIO $ addassociatedfile key
 					(asTopFilePath file)
 					(SQL.WriteHandle qh)
 				when (dstmode /= fmtTreeItemType TreeSymlink) $
@@ -497,6 +497,18 @@ reconcileStaged qh = ifM (Git.Config.isBare <$> gitRepo)
 	largediff :: Int
 	largediff = 1000
+	-- When the database is known to have been newly created and empty
+	-- before reconcileStaged started, it is more efficient to use
+	-- newAssociatedFile. It's safe to use it here because this is run
+	-- with a lock held that blocks any other process that opens the
+	-- database, and when the database is newly created, there is no
+	-- existing process that has it open already. And it's not possible
+	-- for reconcileStaged to call this twice on the same filename with
+	-- two different keys.
+	addassociatedfile
+		| dbisnew = SQL.newAssociatedFile
+		| otherwise = SQL.addAssociatedFile
 {- Normally the keys database is updated incrementally when opened,
 - by reconcileStaged. Calling this explicitly allows running the
 - update at an earlier point.

View file

@@ -1,6 +1,6 @@
 {- Sqlite database of information about Keys
 -
- - Copyright 2015-2021 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2022 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
@@ -88,6 +88,15 @@ addAssociatedFile k f = queueDb $
   where
 	af = SFilePath (getTopFilePath f)
+-- Faster than addAssociatedFile, but only safe to use when the file
+-- was not associated with a different key before, as it does not delete
+-- any old key.
+newAssociatedFile :: Key -> TopFilePath -> WriteHandle -> IO ()
+newAssociatedFile k f = queueDb $
+	void $ insert $ Associated k af
+  where
+	af = SFilePath (getTopFilePath f)
 {- Note that the files returned were once associated with the key, but
 - some of them may not be any longer. -}
 getAssociatedFiles :: Key -> ReadHandle -> IO [TopFilePath]

View file

@@ -12,4 +12,6 @@ This will need some care to be implemented safely...
 I benchmarked it, and using insertUnique is no faster, but using insert is.
 This would be a 15% speed up.
+
+Update: Implemented this optimisation.
 """]]