From fe1b2dfb4bd0554270d1bad8511b64e782513918 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 2 Jun 2023 13:30:30 -0400 Subject: [PATCH] speed up very first tree import by 25% Reading from the cidsdb is responsible for about 25% of the runtime of an import. Since the cidmap is used to store the same information in ram, the cidsdb is not written to during an import any longer. And so, if it started off empty (and updateFromLog wasn't needed), those reads can just be skipped. This is kind of a cheesy optimisation, since after any import from any special remote, the database will no longer be empty, so it's a single use optimisation. But it's probably not uncommon to start by importing a lot of files, and it can save a lot of time then. Sponsored-by: Brock Spratlen on Patreon --- Annex/Import.hs | 20 +++++++++++++------- CHANGELOG | 3 +-- Database/ContentIdentifier.hs | 35 ++++++++++++++++++++--------------- Remote/Helper/ExportImport.hs | 8 ++++---- 4 files changed, 38 insertions(+), 28 deletions(-) diff --git a/Annex/Import.hs b/Annex/Import.hs index 074b7e3b69..de98ba190b 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -626,9 +626,9 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec -- avoid two threads both importing the same content identifier. importing <- liftIO $ newTVarIO S.empty withciddb $ \db -> do - CIDDb.needsUpdateFromLog db - >>= maybe noop (CIDDb.updateFromLog db) - (prepclock (run cidmap importing db)) + db' <- CIDDb.needsUpdateFromLog db + >>= maybe (pure db) (CIDDb.updateFromLog db) + (prepclock (run cidmap importing db')) where -- When not importing content, reuse the same vector -- clock for all state that's recorded. This can save @@ -925,10 +925,16 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec getTopFilePath subdir P. fromImportLocation loc getcidkey cidmap db cid = liftIO $ - CIDDb.getContentIdentifierKeys db rs cid >>= \case - [] -> atomically $ - maybeToList . M.lookup cid <$> readTVar cidmap - l -> return l + -- Avoiding querying the database when it's empty speeds up + -- the initial import. + if CIDDb.databaseIsEmpty db + then getcidkeymap cidmap cid + else CIDDb.getContentIdentifierKeys db rs cid >>= \case + [] -> getcidkeymap cidmap cid + l -> return l + + getcidkeymap cidmap cid = + atomically $ maybeToList . M.lookup cid <$> readTVar cidmap recordcidkey cidmap cid k = do liftIO $ atomically $ modifyTVar' cidmap $ diff --git a/CHANGELOG b/CHANGELOG index 66c77e6857..c89ab86e64 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -77,8 +77,7 @@ git-annex (10.20230408) UNRELEASED; urgency=medium * repair: Fix handling of git ref names on Windows. * Large speed up to importing trees from special remotes that contain a lot of files, by only processing changed files. - * Speed up importing trees from special remotes somewhat by avoiding - redundant writes to sqlite database. + * Some other speedups to importing trees from special remotes. -- Joey Hess Sat, 08 Apr 2023 13:57:18 -0400 diff --git a/Database/ContentIdentifier.hs b/Database/ContentIdentifier.hs index 4cee459d3f..7eb1bd0d58 100644 --- a/Database/ContentIdentifier.hs +++ b/Database/ContentIdentifier.hs @@ -1,6 +1,6 @@ {- Sqlite database of ContentIdentifiers imported from special remotes. - - - Copyright 2019 Joey Hess + - Copyright 2019-2023 Joey Hess -: - Licensed under the GNU AGPL version 3 or higher. -} @@ -19,6 +19,7 @@ module Database.ContentIdentifier ( ContentIdentifierHandle, + databaseIsEmpty, openDb, closeDb, flushDbQueue, @@ -57,7 +58,10 @@ import Database.Persist.Sqlite (runSqlite) import qualified System.FilePath.ByteString as P import qualified Data.Text as T -data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue +data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue Bool + +databaseIsEmpty :: ContentIdentifierHandle -> Bool +databaseIsEmpty (ContentIdentifierHandle _ b) = b share [mkPersist sqlSettings, mkMigrate "migrateContentIdentifier"] [persistLowerCase| ContentIdentifiers @@ -81,23 +85,23 @@ openDb :: Annex ContentIdentifierHandle openDb = do dbdir <- calcRepo' gitAnnexContentIdentifierDbDir let db = dbdir P. "db" - ifM (liftIO $ not <$> R.doesPathExist db) - ( initDb db $ void $ + isnew <- liftIO $ not <$> R.doesPathExist db + if isnew + then initDb db $ void $ runMigrationSilent migrateContentIdentifier -- Migrate from old version of database, which had -- an incorrect uniqueness constraint on the -- ContentIdentifiers table. - , liftIO $ runSqlite (T.pack (fromRawFilePath db)) $ void $ + else liftIO $ runSqlite (T.pack (fromRawFilePath db)) $ void $ runMigrationSilent migrateContentIdentifier - ) h <- liftIO $ H.openDbQueue db "content_identifiers" - return $ ContentIdentifierHandle h + return $ ContentIdentifierHandle h isnew closeDb :: ContentIdentifierHandle -> Annex () -closeDb (ContentIdentifierHandle h) = liftIO $ H.closeDbQueue h +closeDb (ContentIdentifierHandle h _) = liftIO $ H.closeDbQueue h queueDb :: ContentIdentifierHandle -> SqlPersistM () -> IO () -queueDb (ContentIdentifierHandle h) = H.queueDb h checkcommit +queueDb (ContentIdentifierHandle h _) = H.queueDb h checkcommit where -- commit queue after 1000 changes checkcommit sz _lastcommittime @@ -105,7 +109,7 @@ queueDb (ContentIdentifierHandle h) = H.queueDb h checkcommit | otherwise = return False flushDbQueue :: ContentIdentifierHandle -> IO () -flushDbQueue (ContentIdentifierHandle h) = H.flushDbQueue h +flushDbQueue (ContentIdentifierHandle h _) = H.flushDbQueue h -- Be sure to also update the git-annex branch when using this. recordContentIdentifier :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> Key -> IO () @@ -113,7 +117,7 @@ recordContentIdentifier h (RemoteStateHandle u) cid k = queueDb h $ do void $ insertUniqueFast $ ContentIdentifiers u cid k getContentIdentifiers :: ContentIdentifierHandle -> RemoteStateHandle -> Key -> IO [ContentIdentifier] -getContentIdentifiers (ContentIdentifierHandle h) (RemoteStateHandle u) k = +getContentIdentifiers (ContentIdentifierHandle h _) (RemoteStateHandle u) k = H.queryDbQueue h $ do l <- selectList [ ContentIdentifiersKey ==. k @@ -122,7 +126,7 @@ getContentIdentifiers (ContentIdentifierHandle h) (RemoteStateHandle u) k = return $ map (contentIdentifiersCid . entityVal) l getContentIdentifierKeys :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> IO [Key] -getContentIdentifierKeys (ContentIdentifierHandle h) (RemoteStateHandle u) cid = +getContentIdentifierKeys (ContentIdentifierHandle h _) (RemoteStateHandle u) cid = H.queryDbQueue h $ do l <- selectList [ ContentIdentifiersCid ==. cid @@ -136,7 +140,7 @@ recordAnnexBranchTree h s = queueDb h $ do void $ insertUniqueFast $ AnnexBranch $ toSSha s getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha -getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do +getAnnexBranchTree (ContentIdentifierHandle h _) = H.queryDbQueue h $ do l <- selectList ([] :: [Filter AnnexBranch]) [] case l of (s:[]) -> return $ fromSSha $ annexBranchTree $ entityVal s @@ -153,8 +157,8 @@ needsUpdateFromLog db = do _ -> return Nothing {- The database should be locked for write when calling this. -} -updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex () -updateFromLog db (oldtree, currtree) = do +updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex ContentIdentifierHandle +updateFromLog db@(ContentIdentifierHandle h _) (oldtree, currtree) = do (l, cleanup) <- inRepo $ DiffTree.diffTreeRecursive oldtree currtree mapM_ go l @@ -162,6 +166,7 @@ updateFromLog db (oldtree, currtree) = do liftIO $ do recordAnnexBranchTree db currtree flushDbQueue db + return (ContentIdentifierHandle h False) where go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of Nothing -> return () diff --git a/Remote/Helper/ExportImport.hs b/Remote/Helper/ExportImport.hs index 987256eb9a..377c4a7274 100644 --- a/Remote/Helper/ExportImport.hs +++ b/Remote/Helper/ExportImport.hs @@ -280,14 +280,14 @@ adjustExportImport' isexport isimport r rs = do Nothing -> ifM (liftIO $ atomically $ tryPutTMVar lcklckv ()) ( do db <- ContentIdentifier.openDb - ContentIdentifier.needsUpdateFromLog db >>= \case + db' <- ContentIdentifier.needsUpdateFromLog db >>= \case Just v -> do cidlck <- calcRepo' gitAnnexContentIdentifierLock withExclusiveLock cidlck $ ContentIdentifier.updateFromLog db v - Nothing -> noop - liftIO $ atomically $ putTMVar dbtv db - return db + Nothing -> pure db + liftIO $ atomically $ putTMVar dbtv db' + return db' -- loser waits for winner to open the db and -- can then also use its handle , liftIO $ atomically (readTMVar dbtv)