speed up very first tree import by 25%

Reading from the cidsdb is responsible for about 25% of the runtime of
an import. Since the cidmap is used to store the same information in
ram, the cidsdb is not written to during an import any longer. And so,
if it started off empty (and updateFromLog wasn't needed), those reads
can just be skipped.

This is kind of a cheesy optimisation, since after any import from any
special remote, the database will no longer be empty, so it's a single
use optimisation. But it's probably not uncommon to start by importing a
lot of files, and it can save a lot of time then.

Sponsored-by: Brock Spratlen on Patreon
This commit is contained in:
Joey Hess 2023-06-02 13:30:30 -04:00
parent b43fb4923f
commit fe1b2dfb4b
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 38 additions and 28 deletions

View file

@ -626,9 +626,9 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
-- avoid two threads both importing the same content identifier. -- avoid two threads both importing the same content identifier.
importing <- liftIO $ newTVarIO S.empty importing <- liftIO $ newTVarIO S.empty
withciddb $ \db -> do withciddb $ \db -> do
CIDDb.needsUpdateFromLog db db' <- CIDDb.needsUpdateFromLog db
>>= maybe noop (CIDDb.updateFromLog db) >>= maybe (pure db) (CIDDb.updateFromLog db)
(prepclock (run cidmap importing db)) (prepclock (run cidmap importing db'))
where where
-- When not importing content, reuse the same vector -- When not importing content, reuse the same vector
-- clock for all state that's recorded. This can save -- clock for all state that's recorded. This can save
@ -925,11 +925,17 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
getTopFilePath subdir P.</> fromImportLocation loc getTopFilePath subdir P.</> fromImportLocation loc
getcidkey cidmap db cid = liftIO $ getcidkey cidmap db cid = liftIO $
CIDDb.getContentIdentifierKeys db rs cid >>= \case -- Avoiding querying the database when it's empty speeds up
[] -> atomically $ -- the initial import.
maybeToList . M.lookup cid <$> readTVar cidmap if CIDDb.databaseIsEmpty db
then getcidkeymap cidmap cid
else CIDDb.getContentIdentifierKeys db rs cid >>= \case
[] -> getcidkeymap cidmap cid
l -> return l l -> return l
getcidkeymap cidmap cid =
atomically $ maybeToList . M.lookup cid <$> readTVar cidmap
recordcidkey cidmap cid k = do recordcidkey cidmap cid k = do
liftIO $ atomically $ modifyTVar' cidmap $ liftIO $ atomically $ modifyTVar' cidmap $
M.insert cid k M.insert cid k

View file

@ -77,8 +77,7 @@ git-annex (10.20230408) UNRELEASED; urgency=medium
* repair: Fix handling of git ref names on Windows. * repair: Fix handling of git ref names on Windows.
* Large speed up to importing trees from special remotes that contain a lot * Large speed up to importing trees from special remotes that contain a lot
of files, by only processing changed files. of files, by only processing changed files.
* Speed up importing trees from special remotes somewhat by avoiding * Some other speedups to importing trees from special remotes.
redundant writes to sqlite database.
-- Joey Hess <id@joeyh.name> Sat, 08 Apr 2023 13:57:18 -0400 -- Joey Hess <id@joeyh.name> Sat, 08 Apr 2023 13:57:18 -0400

View file

@ -1,6 +1,6 @@
{- Sqlite database of ContentIdentifiers imported from special remotes. {- Sqlite database of ContentIdentifiers imported from special remotes.
- -
- Copyright 2019 Joey Hess <id@joeyh.name> - Copyright 2019-2023 Joey Hess <id@joeyh.name>
-: -:
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -19,6 +19,7 @@
module Database.ContentIdentifier ( module Database.ContentIdentifier (
ContentIdentifierHandle, ContentIdentifierHandle,
databaseIsEmpty,
openDb, openDb,
closeDb, closeDb,
flushDbQueue, flushDbQueue,
@ -57,7 +58,10 @@ import Database.Persist.Sqlite (runSqlite)
import qualified System.FilePath.ByteString as P import qualified System.FilePath.ByteString as P
import qualified Data.Text as T import qualified Data.Text as T
data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue Bool
databaseIsEmpty :: ContentIdentifierHandle -> Bool
databaseIsEmpty (ContentIdentifierHandle _ b) = b
share [mkPersist sqlSettings, mkMigrate "migrateContentIdentifier"] [persistLowerCase| share [mkPersist sqlSettings, mkMigrate "migrateContentIdentifier"] [persistLowerCase|
ContentIdentifiers ContentIdentifiers
@ -81,23 +85,23 @@ openDb :: Annex ContentIdentifierHandle
openDb = do openDb = do
dbdir <- calcRepo' gitAnnexContentIdentifierDbDir dbdir <- calcRepo' gitAnnexContentIdentifierDbDir
let db = dbdir P.</> "db" let db = dbdir P.</> "db"
ifM (liftIO $ not <$> R.doesPathExist db) isnew <- liftIO $ not <$> R.doesPathExist db
( initDb db $ void $ if isnew
then initDb db $ void $
runMigrationSilent migrateContentIdentifier runMigrationSilent migrateContentIdentifier
-- Migrate from old version of database, which had -- Migrate from old version of database, which had
-- an incorrect uniqueness constraint on the -- an incorrect uniqueness constraint on the
-- ContentIdentifiers table. -- ContentIdentifiers table.
, liftIO $ runSqlite (T.pack (fromRawFilePath db)) $ void $ else liftIO $ runSqlite (T.pack (fromRawFilePath db)) $ void $
runMigrationSilent migrateContentIdentifier runMigrationSilent migrateContentIdentifier
)
h <- liftIO $ H.openDbQueue db "content_identifiers" h <- liftIO $ H.openDbQueue db "content_identifiers"
return $ ContentIdentifierHandle h return $ ContentIdentifierHandle h isnew
closeDb :: ContentIdentifierHandle -> Annex () closeDb :: ContentIdentifierHandle -> Annex ()
closeDb (ContentIdentifierHandle h) = liftIO $ H.closeDbQueue h closeDb (ContentIdentifierHandle h _) = liftIO $ H.closeDbQueue h
queueDb :: ContentIdentifierHandle -> SqlPersistM () -> IO () queueDb :: ContentIdentifierHandle -> SqlPersistM () -> IO ()
queueDb (ContentIdentifierHandle h) = H.queueDb h checkcommit queueDb (ContentIdentifierHandle h _) = H.queueDb h checkcommit
where where
-- commit queue after 1000 changes -- commit queue after 1000 changes
checkcommit sz _lastcommittime checkcommit sz _lastcommittime
@ -105,7 +109,7 @@ queueDb (ContentIdentifierHandle h) = H.queueDb h checkcommit
| otherwise = return False | otherwise = return False
flushDbQueue :: ContentIdentifierHandle -> IO () flushDbQueue :: ContentIdentifierHandle -> IO ()
flushDbQueue (ContentIdentifierHandle h) = H.flushDbQueue h flushDbQueue (ContentIdentifierHandle h _) = H.flushDbQueue h
-- Be sure to also update the git-annex branch when using this. -- Be sure to also update the git-annex branch when using this.
recordContentIdentifier :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> Key -> IO () recordContentIdentifier :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> Key -> IO ()
@ -113,7 +117,7 @@ recordContentIdentifier h (RemoteStateHandle u) cid k = queueDb h $ do
void $ insertUniqueFast $ ContentIdentifiers u cid k void $ insertUniqueFast $ ContentIdentifiers u cid k
getContentIdentifiers :: ContentIdentifierHandle -> RemoteStateHandle -> Key -> IO [ContentIdentifier] getContentIdentifiers :: ContentIdentifierHandle -> RemoteStateHandle -> Key -> IO [ContentIdentifier]
getContentIdentifiers (ContentIdentifierHandle h) (RemoteStateHandle u) k = getContentIdentifiers (ContentIdentifierHandle h _) (RemoteStateHandle u) k =
H.queryDbQueue h $ do H.queryDbQueue h $ do
l <- selectList l <- selectList
[ ContentIdentifiersKey ==. k [ ContentIdentifiersKey ==. k
@ -122,7 +126,7 @@ getContentIdentifiers (ContentIdentifierHandle h) (RemoteStateHandle u) k =
return $ map (contentIdentifiersCid . entityVal) l return $ map (contentIdentifiersCid . entityVal) l
getContentIdentifierKeys :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> IO [Key] getContentIdentifierKeys :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> IO [Key]
getContentIdentifierKeys (ContentIdentifierHandle h) (RemoteStateHandle u) cid = getContentIdentifierKeys (ContentIdentifierHandle h _) (RemoteStateHandle u) cid =
H.queryDbQueue h $ do H.queryDbQueue h $ do
l <- selectList l <- selectList
[ ContentIdentifiersCid ==. cid [ ContentIdentifiersCid ==. cid
@ -136,7 +140,7 @@ recordAnnexBranchTree h s = queueDb h $ do
void $ insertUniqueFast $ AnnexBranch $ toSSha s void $ insertUniqueFast $ AnnexBranch $ toSSha s
getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do getAnnexBranchTree (ContentIdentifierHandle h _) = H.queryDbQueue h $ do
l <- selectList ([] :: [Filter AnnexBranch]) [] l <- selectList ([] :: [Filter AnnexBranch]) []
case l of case l of
(s:[]) -> return $ fromSSha $ annexBranchTree $ entityVal s (s:[]) -> return $ fromSSha $ annexBranchTree $ entityVal s
@ -153,8 +157,8 @@ needsUpdateFromLog db = do
_ -> return Nothing _ -> return Nothing
{- The database should be locked for write when calling this. -} {- The database should be locked for write when calling this. -}
updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex () updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex ContentIdentifierHandle
updateFromLog db (oldtree, currtree) = do updateFromLog db@(ContentIdentifierHandle h _) (oldtree, currtree) = do
(l, cleanup) <- inRepo $ (l, cleanup) <- inRepo $
DiffTree.diffTreeRecursive oldtree currtree DiffTree.diffTreeRecursive oldtree currtree
mapM_ go l mapM_ go l
@ -162,6 +166,7 @@ updateFromLog db (oldtree, currtree) = do
liftIO $ do liftIO $ do
recordAnnexBranchTree db currtree recordAnnexBranchTree db currtree
flushDbQueue db flushDbQueue db
return (ContentIdentifierHandle h False)
where where
go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
Nothing -> return () Nothing -> return ()

View file

@ -280,14 +280,14 @@ adjustExportImport' isexport isimport r rs = do
Nothing -> ifM (liftIO $ atomically $ tryPutTMVar lcklckv ()) Nothing -> ifM (liftIO $ atomically $ tryPutTMVar lcklckv ())
( do ( do
db <- ContentIdentifier.openDb db <- ContentIdentifier.openDb
ContentIdentifier.needsUpdateFromLog db >>= \case db' <- ContentIdentifier.needsUpdateFromLog db >>= \case
Just v -> do Just v -> do
cidlck <- calcRepo' gitAnnexContentIdentifierLock cidlck <- calcRepo' gitAnnexContentIdentifierLock
withExclusiveLock cidlck $ withExclusiveLock cidlck $
ContentIdentifier.updateFromLog db v ContentIdentifier.updateFromLog db v
Nothing -> noop Nothing -> pure db
liftIO $ atomically $ putTMVar dbtv db liftIO $ atomically $ putTMVar dbtv db'
return db return db'
-- loser waits for winner to open the db and -- loser waits for winner to open the db and
-- can then also use its handle -- can then also use its handle
, liftIO $ atomically (readTMVar dbtv) , liftIO $ atomically (readTMVar dbtv)