From 6fbd337e34cd62881ed5e21616e9c00d7bf35378 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 12 Oct 2022 15:21:19 -0400 Subject: [PATCH] avoid uncessary keys db writes; doubled speed! When running eg git-annex get, for each file it has to read from and write to the keys database. But it's reading exclusively from one table, and writing to a different table. So, it is not necessary to flush the write to the database before reading. This avoids writing the database once per file, instead it will buffer 1000 changes before writing. Benchmarking getting 1000 small files from a local origin, git-annex get now takes 13.62s, down from 22.41s! git-annex drop now takes 9.07s, down from 18.63s! Wowowowowowowow! (It would perhaps have been better if there were separate databases for the two tables. At least it would have avoided this complexity. Ah well, this is better than splitting the table in a annex.version upgrade.) Sponsored-by: Dartmouth College's Datalad project --- Annex/WorkTree.hs | 13 +---- CHANGELOG | 2 + Database/Keys.hs | 105 ++++++++++++++++++++++++---------------- Database/Keys/Handle.hs | 13 +++-- Database/Keys/Tables.hs | 38 +++++++++++++++ Upgrade/V7.hs | 4 +- git-annex.cabal | 1 + 7 files changed, 117 insertions(+), 59 deletions(-) create mode 100644 Database/Keys/Tables.hs diff --git a/Annex/WorkTree.hs b/Annex/WorkTree.hs index d7e423c6e8..95e0d18e2b 100644 --- a/Annex/WorkTree.hs +++ b/Annex/WorkTree.hs @@ -53,15 +53,6 @@ whenAnnexed a file = ifAnnexed file (a file) (return Nothing) ifAnnexed :: RawFilePath -> (Key -> Annex a) -> Annex a -> Annex a ifAnnexed file yes no = maybe no yes =<< lookupKey file -{- Find all annexed files and update the keys database for them. - - - - Normally the keys database is updated incrementally when it's being - - opened, and changes are noticed. Calling this explicitly allows - - running the update at an earlier point. - - - - All that needs to be done is to open the database, - - that will result in Database.Keys.reconcileStaged - - running, and doing the work. - -} +{- Find all annexed files and update the keys database for them. -} scanAnnexedFiles :: Annex () -scanAnnexedFiles = Database.Keys.runWriter (const noop) +scanAnnexedFiles = Database.Keys.updateDatabase diff --git a/CHANGELOG b/CHANGELOG index 56f2c781fe..6c045d7609 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,7 @@ git-annex (10.20221004) UNRELEASED; urgency=medium + * Doubled the speed of git-annex drop when operating on many files, + and of git-annex get when operating on many tiny files. * trust, untrust, semitrust, dead: Fix behavior when provided with multiple repositories to operate on. * trust, untrust, semitrust, dead: When provided with no parameters, diff --git a/Database/Keys.hs b/Database/Keys.hs index 6b4f3e4782..45f8d2f851 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -25,11 +25,13 @@ module Database.Keys ( removeInodeCache, isInodeKnown, runWriter, + updateDatabase, ) where import qualified Database.Keys.SQL as SQL import Database.Types import Database.Keys.Handle +import Database.Keys.Tables import qualified Database.Queue as H import Database.Init import Annex.Locations @@ -64,49 +66,53 @@ import Control.Concurrent.Async - If the database is already open, any writes are flushed to it, to ensure - consistency. - - - Any queued writes will be flushed before the read. + - Any queued writes to the table will be flushed before the read. -} -runReader :: Monoid v => (SQL.ReadHandle -> Annex v) -> Annex v -runReader a = do +runReader :: Monoid v => DbTable -> (SQL.ReadHandle -> Annex v) -> Annex v +runReader t a = do h <- Annex.getRead Annex.keysdbhandle withDbState h go where go DbUnavailable = return (mempty, DbUnavailable) - go st@(DbOpen qh) = do - liftIO $ H.flushDbQueue qh + go (DbOpen (qh, tableschanged)) = do + tableschanged' <- if isDbTableChanged tableschanged t + then do + liftIO $ H.flushDbQueue qh + return mempty + else return tableschanged v <- a (SQL.ReadHandle qh) - return (v, st) + return (v, DbOpen (qh, tableschanged')) go DbClosed = do - st' <- openDb False DbClosed - v <- case st' of - (DbOpen qh) -> a (SQL.ReadHandle qh) + st <- openDb False DbClosed + v <- case st of + (DbOpen (qh, _)) -> a (SQL.ReadHandle qh) _ -> return mempty - return (v, st') + return (v, st) -runReaderIO :: Monoid v => (SQL.ReadHandle -> IO v) -> Annex v -runReaderIO a = runReader (liftIO . a) +runReaderIO :: Monoid v => DbTable -> (SQL.ReadHandle -> IO v) -> Annex v +runReaderIO t a = runReader t (liftIO . a) {- Runs an action that writes to the database. Typically this is used to - queue changes, which will be flushed at a later point. - - The database is created if it doesn't exist yet. -} -runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex () -runWriter a = do +runWriter :: DbTable -> (SQL.WriteHandle -> Annex ()) -> Annex () +runWriter t a = do h <- Annex.getRead Annex.keysdbhandle withDbState h go where - go st@(DbOpen qh) = do + go (DbOpen (qh, tableschanged)) = do v <- a (SQL.WriteHandle qh) - return (v, st) + return (v, DbOpen (qh, addDbTable tableschanged t)) go st = do st' <- openDb True st v <- case st' of - DbOpen qh -> a (SQL.WriteHandle qh) + DbOpen (qh, _) -> a (SQL.WriteHandle qh) _ -> error "internal" return (v, st') -runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex () -runWriterIO a = runWriter (liftIO . a) +runWriterIO :: DbTable -> (SQL.WriteHandle -> IO ()) -> Annex () +runWriterIO t a = runWriter t (liftIO . a) {- Opens the database, creating it if it doesn't exist yet. - @@ -139,8 +145,8 @@ openDb forwrite _ = do open db = do qh <- liftIO $ H.openDbQueue db SQL.containedTable - reconcileStaged qh - return $ DbOpen qh + tc <- reconcileStaged qh + return $ DbOpen (qh, tc) {- Closes the database if it was open. Any writes will be flushed to it. - @@ -155,12 +161,13 @@ flushDb :: Annex () flushDb = liftIO . flushDbQueue =<< Annex.getRead Annex.keysdbhandle addAssociatedFile :: Key -> TopFilePath -> Annex () -addAssociatedFile k f = runWriterIO $ SQL.addAssociatedFile k f +addAssociatedFile k f = runWriterIO AssociatedTable $ SQL.addAssociatedFile k f {- Note that the files returned were once associated with the key, but - some of them may not be any longer. -} getAssociatedFiles :: Key -> Annex [TopFilePath] -getAssociatedFiles k = emptyWhenBare $ runReaderIO $ SQL.getAssociatedFiles k +getAssociatedFiles k = emptyWhenBare $ runReaderIO AssociatedTable $ + SQL.getAssociatedFiles k {- Queries for associated files never return anything when in a bare - repository, since without a work tree there can be no associated files. @@ -186,10 +193,12 @@ getAssociatedFilesIncluding afile k = emptyWhenBare $ do {- Gets any keys that are on record as having a particular associated file. - (Should be one or none but the database doesn't enforce that.) -} getAssociatedKey :: TopFilePath -> Annex [Key] -getAssociatedKey f = emptyWhenBare $ runReaderIO $ SQL.getAssociatedKey f +getAssociatedKey f = emptyWhenBare $ runReaderIO AssociatedTable $ + SQL.getAssociatedKey f removeAssociatedFile :: Key -> TopFilePath -> Annex () -removeAssociatedFile k = runWriterIO . SQL.removeAssociatedFile k +removeAssociatedFile k = runWriterIO AssociatedTable . + SQL.removeAssociatedFile k {- Stats the files, and stores their InodeCaches. -} storeInodeCaches :: Key -> [RawFilePath] -> Annex () @@ -198,7 +207,7 @@ storeInodeCaches k fs = withTSDelta $ \d -> =<< liftIO (mapM (\f -> genInodeCache f d) fs) addInodeCaches :: Key -> [InodeCache] -> Annex () -addInodeCaches k is = runWriterIO $ SQL.addInodeCaches k is +addInodeCaches k is = runWriterIO ContentTable $ SQL.addInodeCaches k is {- A key may have multiple InodeCaches; one for the annex object, and one - for each pointer file that is a copy of it. @@ -210,18 +219,19 @@ addInodeCaches k is = runWriterIO $ SQL.addInodeCaches k is - for pointer files, but none recorded for the annex object. -} getInodeCaches :: Key -> Annex [InodeCache] -getInodeCaches = runReaderIO . SQL.getInodeCaches +getInodeCaches = runReaderIO ContentTable . SQL.getInodeCaches {- Remove all inodes cached for a key. -} removeInodeCaches :: Key -> Annex () -removeInodeCaches = runWriterIO . SQL.removeInodeCaches +removeInodeCaches = runWriterIO ContentTable . SQL.removeInodeCaches {- Remove cached inodes, for any key. -} removeInodeCache :: InodeCache -> Annex () -removeInodeCache = runWriterIO . SQL.removeInodeCache +removeInodeCache = runWriterIO ContentTable . SQL.removeInodeCache isInodeKnown :: InodeCache -> SentinalStatus -> Annex Bool -isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s) +isInodeKnown i s = or <$> runReaderIO ContentTable + ((:[]) <$$> SQL.isInodeKnown i s) {- Looks at staged changes to annexed files, and updates the keys database, - so that its information is consistent with the state of the repository. @@ -250,18 +260,21 @@ isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s) - So when using getAssociatedFiles, have to make sure the file still - is an associated file. -} -reconcileStaged :: H.DbQueue -> Annex () -reconcileStaged qh = unlessM (Git.Config.isBare <$> gitRepo) $ do - gitindex <- inRepo currentIndexFile - indexcache <- fromRawFilePath <$> calcRepo' gitAnnexKeysDbIndexCache - withTSDelta (liftIO . genInodeCache gitindex) >>= \case - Just cur -> readindexcache indexcache >>= \case - Nothing -> go cur indexcache =<< getindextree - Just prev -> ifM (compareInodeCaches prev cur) - ( noop - , go cur indexcache =<< getindextree - ) - Nothing -> noop +reconcileStaged :: H.DbQueue -> Annex DbTablesChanged +reconcileStaged qh = ifM (Git.Config.isBare <$> gitRepo) + ( return mempty + , do + gitindex <- inRepo currentIndexFile + indexcache <- fromRawFilePath <$> calcRepo' gitAnnexKeysDbIndexCache + withTSDelta (liftIO . genInodeCache gitindex) >>= \case + Just cur -> readindexcache indexcache >>= \case + Nothing -> go cur indexcache =<< getindextree + Just prev -> ifM (compareInodeCaches prev cur) + ( return mempty + , go cur indexcache =<< getindextree + ) + Nothing -> return mempty + ) where lastindexref = Ref "refs/annex/last-index" @@ -286,6 +299,7 @@ reconcileStaged qh = unlessM (Git.Config.isBare <$> gitRepo) $ do -- against next time. inRepo $ update' lastindexref newtree fastDebug "Database.Keys" "reconcileStaged end" + return (DbTablesChanged True True) -- git write-tree will fail if the index is locked or when there is -- a merge conflict. To get up-to-date with the current index, -- diff --staged with the old index tree. The current index tree @@ -307,6 +321,7 @@ reconcileStaged qh = unlessM (Git.Config.isBare <$> gitRepo) $ do void $ updatetodiff g Nothing "--staged" (procmergeconflictdiff mdfeeder) fastDebug "Database.Keys" "reconcileStaged end" + return (DbTablesChanged True True) updatetodiff g old new processor = do (l, cleanup) <- pipeNullSplit' (diff old new) g @@ -482,3 +497,9 @@ reconcileStaged qh = unlessM (Git.Config.isBare <$> gitRepo) $ do largediff :: Int largediff = 1000 +{- Normally the keys database is updated incrementally when opened, + - by reconcileStaged. Calling this explicitly allows running the + - update at an earlier point. + -} +updateDatabase :: Annex () +updateDatabase = runWriter ContentTable (const noop) diff --git a/Database/Keys/Handle.hs b/Database/Keys/Handle.hs index ed7cc6e6c8..1e4a85427b 100644 --- a/Database/Keys/Handle.hs +++ b/Database/Keys/Handle.hs @@ -15,6 +15,7 @@ module Database.Keys.Handle ( ) where import qualified Database.Queue as H +import Database.Keys.Tables import Utility.Exception import Utility.DebugLocks @@ -29,7 +30,7 @@ newtype DbHandle = DbHandle (MVar DbState) -- The database can be closed or open, but it also may have been -- tried to open (for read) and didn't exist yet or is not readable. -data DbState = DbClosed | DbOpen H.DbQueue | DbUnavailable +data DbState = DbClosed | DbOpen (H.DbQueue, DbTablesChanged) | DbUnavailable newDbHandle :: IO DbHandle newDbHandle = DbHandle <$> newMVar DbClosed @@ -52,15 +53,17 @@ withDbState (DbHandle mvar) a = do return v flushDbQueue :: DbHandle -> IO () -flushDbQueue (DbHandle mvar) = go =<< debugLocks (readMVar mvar) +flushDbQueue h = withDbState h go where - go (DbOpen qh) = H.flushDbQueue qh - go _ = return () + go (DbOpen (qh, _)) = do + H.flushDbQueue qh + return ((), DbOpen (qh, mempty)) + go st = return ((), st) closeDbHandle :: DbHandle -> IO () closeDbHandle h = withDbState h go where - go (DbOpen qh) = do + go (DbOpen (qh, _)) = do H.closeDbQueue qh return ((), DbClosed) go st = return ((), st) diff --git a/Database/Keys/Tables.hs b/Database/Keys/Tables.hs new file mode 100644 index 0000000000..ab6a4fb757 --- /dev/null +++ b/Database/Keys/Tables.hs @@ -0,0 +1,38 @@ +{- Keeping track of which tables in the keys database have changed + - + - Copyright 2022 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +module Database.Keys.Tables where + +import Data.Monoid +import qualified Data.Semigroup as Sem +import Prelude + +data DbTable = AssociatedTable | ContentTable + deriving (Eq, Show) + +data DbTablesChanged = DbTablesChanged + { associatedTable :: Bool + , contentTable :: Bool + } + deriving (Show) + +instance Sem.Semigroup DbTablesChanged where + a <> b = DbTablesChanged + { associatedTable = associatedTable a || associatedTable b + , contentTable = contentTable a || contentTable b + } + +instance Monoid DbTablesChanged where + mempty = DbTablesChanged False False + +addDbTable :: DbTablesChanged -> DbTable -> DbTablesChanged +addDbTable ts AssociatedTable = ts { associatedTable = True } +addDbTable ts ContentTable = ts { contentTable = True } + +isDbTableChanged :: DbTablesChanged -> DbTable -> Bool +isDbTableChanged ts AssociatedTable = associatedTable ts +isDbTableChanged ts ContentTable = contentTable ts diff --git a/Upgrade/V7.hs b/Upgrade/V7.hs index 28e808b309..219c11ed14 100644 --- a/Upgrade/V7.hs +++ b/Upgrade/V7.hs @@ -16,6 +16,7 @@ import Types.Upgrade import Annex.CatFile import qualified Database.Keys import qualified Database.Keys.SQL +import Database.Keys.Tables import qualified Git.LsFiles as LsFiles import qualified Git import Git.FilePath @@ -114,8 +115,9 @@ populateKeysDb = unlessM isBareRepo $ do Nothing -> noop Just k -> do topf <- inRepo $ toTopFilePath $ toRawFilePath f - Database.Keys.runWriter $ \h -> liftIO $ do + Database.Keys.runWriter AssociatedTable $ \h -> liftIO $ Database.Keys.SQL.addAssociatedFile k topf h + Database.Keys.runWriter ContentTable $ \h -> liftIO $ Database.Keys.SQL.addInodeCaches k [ic] h liftIO $ void cleanup Database.Keys.closeDb diff --git a/git-annex.cabal b/git-annex.cabal index ce0cd60ea7..c60e627682 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -830,6 +830,7 @@ Executable git-annex Database.Init Database.Keys Database.Keys.Handle + Database.Keys.Tables Database.Keys.SQL Database.Queue Database.Types