diff --git a/Annex/Import.hs b/Annex/Import.hs
index 074b7e3b69..de98ba190b 100644
--- a/Annex/Import.hs
+++ b/Annex/Import.hs
@@ -626,9 +626,9 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
     -- avoid two threads both importing the same content identifier.
     importing <- liftIO $ newTVarIO S.empty
     withciddb $ \db -> do
-        CIDDb.needsUpdateFromLog db
-            >>= maybe noop (CIDDb.updateFromLog db)
-        (prepclock (run cidmap importing db))
+        db' <- CIDDb.needsUpdateFromLog db
+            >>= maybe (pure db) (CIDDb.updateFromLog db)
+        (prepclock (run cidmap importing db'))
   where
     -- When not importing content, reuse the same vector
     -- clock for all state that's recorded. This can save
@@ -925,10 +925,16 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
         getTopFilePath subdir P.</> fromImportLocation loc
 
     getcidkey cidmap db cid = liftIO $
-        CIDDb.getContentIdentifierKeys db rs cid >>= \case
-            [] -> atomically $
-                maybeToList . M.lookup cid <$> readTVar cidmap
-            l -> return l
+        -- Avoiding querying the database when it's empty speeds up
+        -- the initial import.
+        if CIDDb.databaseIsEmpty db
+            then getcidkeymap cidmap cid
+            else CIDDb.getContentIdentifierKeys db rs cid >>= \case
+                [] -> getcidkeymap cidmap cid
+                l -> return l
+
+    getcidkeymap cidmap cid =
+        atomically $ maybeToList . M.lookup cid <$> readTVar cidmap
 
     recordcidkey cidmap cid k = do
         liftIO $ atomically $ modifyTVar' cidmap $
diff --git a/CHANGELOG b/CHANGELOG
index 66c77e6857..c89ab86e64 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -77,8 +77,7 @@ git-annex (10.20230408) UNRELEASED; urgency=medium
   * repair: Fix handling of git ref names on Windows.
   * Large speed up to importing trees from special remotes that contain
     a lot of files, by only processing changed files.
-  * Speed up importing trees from special remotes somewhat by avoiding
-    redundant writes to sqlite database.
+  * Some other speedups to importing trees from special remotes.
 
  -- Joey Hess <id@joeyh.name>  Sat, 08 Apr 2023 13:57:18 -0400
diff --git a/Database/ContentIdentifier.hs b/Database/ContentIdentifier.hs
index 4cee459d3f..7eb1bd0d58 100644
--- a/Database/ContentIdentifier.hs
+++ b/Database/ContentIdentifier.hs
@@ -1,6 +1,6 @@
 {- Sqlite database of ContentIdentifiers imported from special remotes.
  -
- - Copyright 2019 Joey Hess <id@joeyh.name>
+ - Copyright 2019-2023 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -19,6 +19,7 @@ module Database.ContentIdentifier (
     ContentIdentifierHandle,
+    databaseIsEmpty,
     openDb,
     closeDb,
     flushDbQueue,
@@ -57,7 +58,10 @@ import Database.Persist.Sqlite (runSqlite)
 import qualified System.FilePath.ByteString as P
 import qualified Data.Text as T
 
-data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue
+data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue Bool
+
+databaseIsEmpty :: ContentIdentifierHandle -> Bool
+databaseIsEmpty (ContentIdentifierHandle _ b) = b
 
 share [mkPersist sqlSettings, mkMigrate "migrateContentIdentifier"] [persistLowerCase|
 ContentIdentifiers
@@ -81,23 +85,23 @@ openDb :: Annex ContentIdentifierHandle
 openDb = do
     dbdir <- calcRepo' gitAnnexContentIdentifierDbDir
     let db = dbdir P.</> "db"
-    ifM (liftIO $ not <$> R.doesPathExist db)
-        ( initDb db $ void $
+    isnew <- liftIO $ not <$> R.doesPathExist db
+    if isnew
+        then initDb db $ void $
             runMigrationSilent migrateContentIdentifier
         -- Migrate from old version of database, which had
         -- an incorrect uniqueness constraint on the
         -- ContentIdentifiers table.
-        , liftIO $ runSqlite (T.pack (fromRawFilePath db)) $ void $
+        else liftIO $ runSqlite (T.pack (fromRawFilePath db)) $ void $
             runMigrationSilent migrateContentIdentifier
-        )
     h <- liftIO $ H.openDbQueue db "content_identifiers"
-    return $ ContentIdentifierHandle h
+    return $ ContentIdentifierHandle h isnew
 
 closeDb :: ContentIdentifierHandle -> Annex ()
-closeDb (ContentIdentifierHandle h) = liftIO $ H.closeDbQueue h
+closeDb (ContentIdentifierHandle h _) = liftIO $ H.closeDbQueue h
 
 queueDb :: ContentIdentifierHandle -> SqlPersistM () -> IO ()
-queueDb (ContentIdentifierHandle h) = H.queueDb h checkcommit
+queueDb (ContentIdentifierHandle h _) = H.queueDb h checkcommit
     where
         -- commit queue after 1000 changes
         checkcommit sz _lastcommittime
@@ -105,7 +109,7 @@ queueDb (ContentIdentifierHandle h) = H.queueDb h checkcommit
         | otherwise = return False
 
 flushDbQueue :: ContentIdentifierHandle -> IO ()
-flushDbQueue (ContentIdentifierHandle h) = H.flushDbQueue h
+flushDbQueue (ContentIdentifierHandle h _) = H.flushDbQueue h
 
 -- Be sure to also update the git-annex branch when using this.
 recordContentIdentifier :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> Key -> IO ()
@@ -113,7 +117,7 @@ recordContentIdentifier h (RemoteStateHandle u) cid k = queueDb h $ do
     void $ insertUniqueFast $ ContentIdentifiers u cid k
 
 getContentIdentifiers :: ContentIdentifierHandle -> RemoteStateHandle -> Key -> IO [ContentIdentifier]
-getContentIdentifiers (ContentIdentifierHandle h) (RemoteStateHandle u) k =
+getContentIdentifiers (ContentIdentifierHandle h _) (RemoteStateHandle u) k =
     H.queryDbQueue h $ do
         l <- selectList
             [ ContentIdentifiersKey ==. k
@@ -122,7 +126,7 @@ getContentIdentifiers (ContentIdentifierHandle h) (RemoteStateHandle u) k =
     return $ map (contentIdentifiersCid . entityVal) l
 
 getContentIdentifierKeys :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> IO [Key]
-getContentIdentifierKeys (ContentIdentifierHandle h) (RemoteStateHandle u) cid =
+getContentIdentifierKeys (ContentIdentifierHandle h _) (RemoteStateHandle u) cid =
     H.queryDbQueue h $ do
         l <- selectList
             [ ContentIdentifiersCid ==. cid
@@ -136,7 +140,7 @@ recordAnnexBranchTree h s = queueDb h $ do
     void $ insertUniqueFast $ AnnexBranch $ toSSha s
 
 getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
-getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do
+getAnnexBranchTree (ContentIdentifierHandle h _) = H.queryDbQueue h $ do
     l <- selectList ([] :: [Filter AnnexBranch]) []
     case l of
         (s:[]) -> return $ fromSSha $ annexBranchTree $ entityVal s
@@ -153,8 +157,8 @@ needsUpdateFromLog db = do
         _ -> return Nothing
 
 {- The database should be locked for write when calling this. -}
-updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex ()
-updateFromLog db (oldtree, currtree) = do
+updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex ContentIdentifierHandle
+updateFromLog db@(ContentIdentifierHandle h _) (oldtree, currtree) = do
     (l, cleanup) <- inRepo $ DiffTree.diffTreeRecursive oldtree currtree
     mapM_ go l
@@ -162,6 +166,7 @@ updateFromLog db (oldtree, currtree) = do
     liftIO $ do
         recordAnnexBranchTree db currtree
         flushDbQueue db
+    return (ContentIdentifierHandle h False)
   where
     go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
         Nothing -> return ()
diff --git a/Remote/Helper/ExportImport.hs b/Remote/Helper/ExportImport.hs
index 987256eb9a..377c4a7274 100644
--- a/Remote/Helper/ExportImport.hs
+++ b/Remote/Helper/ExportImport.hs
@@ -280,14 +280,14 @@ adjustExportImport' isexport isimport r rs = do
         Nothing -> ifM (liftIO $ atomically $ tryPutTMVar lcklckv ())
             ( do
                 db <- ContentIdentifier.openDb
-                ContentIdentifier.needsUpdateFromLog db >>= \case
+                db' <- ContentIdentifier.needsUpdateFromLog db >>= \case
                     Just v -> do
                         cidlck <- calcRepo' gitAnnexContentIdentifierLock
                         withExclusiveLock cidlck $
                             ContentIdentifier.updateFromLog db v
-                    Nothing -> noop
-                liftIO $ atomically $ putTMVar dbtv db
-                return db
+                    Nothing -> pure db
+                liftIO $ atomically $ putTMVar dbtv db'
+                return db'
             -- loser waits for winner to open the db and
             -- can then also use its handle
             , liftIO $ atomically (readTMVar dbtv)
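
Note (not part of the patch): the sketch below is a minimal, self-contained Haskell illustration of the pattern the patch applies to ContentIdentifierHandle. A handle carries a Bool recorded at open time saying whether its backing store started out empty; lookups then fall back to an in-memory map without touching the store, and the one operation that writes to the store hands back a handle with the flag cleared, just as updateFromLog now returns a new ContentIdentifierHandle. The names Handle, openStore, lookupKey and updateStore are hypothetical stand-ins, not the git-annex API.

-- Toy model of the "database is empty" optimization; see note above.
module Main where

import Data.IORef
import qualified Data.Map.Strict as M

-- The Bool plays the role of the one added to ContentIdentifierHandle.
data Handle = Handle (IORef (M.Map String [String])) Bool

storeIsEmpty :: Handle -> Bool
storeIsEmpty (Handle _ b) = b

-- Opening records whether the store starts out empty (in this toy, always).
openStore :: IO Handle
openStore = do
    ref <- newIORef M.empty
    return (Handle ref True)

-- Lookup falls back to an in-memory map (standing in for the cidmap TVar)
-- and does not query the store at all while it is known to be empty.
lookupKey :: Handle -> M.Map String [String] -> String -> IO [String]
lookupKey (Handle ref isempty) fallback k
    | isempty = return fromfallback
    | otherwise = do
        m <- readIORef ref
        case M.findWithDefault [] k m of
            [] -> return fromfallback
            vs -> return vs
  where
    fromfallback = M.findWithDefault [] k fallback

-- Writing hands back a handle with the flag cleared, the same way
-- updateFromLog now returns a new ContentIdentifierHandle.
updateStore :: Handle -> [(String, String)] -> IO Handle
updateStore (Handle ref _) kvs = do
    modifyIORef' ref $ \m -> foldr (\(k, v) -> M.insertWith (++) k [v]) m kvs
    return (Handle ref False)

main :: IO ()
main = do
    h <- openStore
    -- Store is empty: answered purely from the fallback map.
    print =<< lookupKey h (M.fromList [("cid1", ["key1"])]) "cid1"
    h' <- updateStore h [("cid2", "key2")]
    -- The returned handle has the flag cleared, so the store is consulted.
    print =<< lookupKey h' M.empty "cid2"

The property this sketch mirrors is that callers must keep using the handle returned by the writer, which is why Annex/Import.hs and Remote/Helper/ExportImport.hs now thread db' through instead of reusing db; otherwise a later lookup could wrongly treat a freshly populated database as still empty.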