avoid uncessary keys db writes; doubled speed!

When running eg git-annex get, for each file it has to read from and
write to the keys database. But it's reading exclusively from one table,
and writing to a different table. So, it is not necessary to flush the
write to the database before reading. This avoids writing the database
once per file, instead it will buffer 1000 changes before writing.

Benchmarking getting 1000 small files from a local origin,
git-annex get now takes 13.62s, down from 22.41s!
git-annex drop now takes 9.07s, down from 18.63s!
Wowowowowowowow!

(It would perhaps have been better if there were separate databases for
the two tables. At least it would have avoided this complexity. Ah well,
this is better than splitting the table in a annex.version upgrade.)

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2022-10-12 15:21:19 -04:00
parent ba7ecbc6a9
commit 6fbd337e34
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 117 additions and 59 deletions

View file

@ -53,15 +53,6 @@ whenAnnexed a file = ifAnnexed file (a file) (return Nothing)
ifAnnexed :: RawFilePath -> (Key -> Annex a) -> Annex a -> Annex a ifAnnexed :: RawFilePath -> (Key -> Annex a) -> Annex a -> Annex a
ifAnnexed file yes no = maybe no yes =<< lookupKey file ifAnnexed file yes no = maybe no yes =<< lookupKey file
{- Find all annexed files and update the keys database for them. {- Find all annexed files and update the keys database for them. -}
-
- Normally the keys database is updated incrementally when it's being
- opened, and changes are noticed. Calling this explicitly allows
- running the update at an earlier point.
-
- All that needs to be done is to open the database,
- that will result in Database.Keys.reconcileStaged
- running, and doing the work.
-}
scanAnnexedFiles :: Annex () scanAnnexedFiles :: Annex ()
scanAnnexedFiles = Database.Keys.runWriter (const noop) scanAnnexedFiles = Database.Keys.updateDatabase

View file

@ -1,5 +1,7 @@
git-annex (10.20221004) UNRELEASED; urgency=medium git-annex (10.20221004) UNRELEASED; urgency=medium
* Doubled the speed of git-annex drop when operating on many files,
and of git-annex get when operating on many tiny files.
* trust, untrust, semitrust, dead: Fix behavior when provided with * trust, untrust, semitrust, dead: Fix behavior when provided with
multiple repositories to operate on. multiple repositories to operate on.
* trust, untrust, semitrust, dead: When provided with no parameters, * trust, untrust, semitrust, dead: When provided with no parameters,

View file

@ -25,11 +25,13 @@ module Database.Keys (
removeInodeCache, removeInodeCache,
isInodeKnown, isInodeKnown,
runWriter, runWriter,
updateDatabase,
) where ) where
import qualified Database.Keys.SQL as SQL import qualified Database.Keys.SQL as SQL
import Database.Types import Database.Types
import Database.Keys.Handle import Database.Keys.Handle
import Database.Keys.Tables
import qualified Database.Queue as H import qualified Database.Queue as H
import Database.Init import Database.Init
import Annex.Locations import Annex.Locations
@ -64,49 +66,53 @@ import Control.Concurrent.Async
- If the database is already open, any writes are flushed to it, to ensure - If the database is already open, any writes are flushed to it, to ensure
- consistency. - consistency.
- -
- Any queued writes will be flushed before the read. - Any queued writes to the table will be flushed before the read.
-} -}
runReader :: Monoid v => (SQL.ReadHandle -> Annex v) -> Annex v runReader :: Monoid v => DbTable -> (SQL.ReadHandle -> Annex v) -> Annex v
runReader a = do runReader t a = do
h <- Annex.getRead Annex.keysdbhandle h <- Annex.getRead Annex.keysdbhandle
withDbState h go withDbState h go
where where
go DbUnavailable = return (mempty, DbUnavailable) go DbUnavailable = return (mempty, DbUnavailable)
go st@(DbOpen qh) = do go (DbOpen (qh, tableschanged)) = do
tableschanged' <- if isDbTableChanged tableschanged t
then do
liftIO $ H.flushDbQueue qh liftIO $ H.flushDbQueue qh
return mempty
else return tableschanged
v <- a (SQL.ReadHandle qh) v <- a (SQL.ReadHandle qh)
return (v, st) return (v, DbOpen (qh, tableschanged'))
go DbClosed = do go DbClosed = do
st' <- openDb False DbClosed st <- openDb False DbClosed
v <- case st' of v <- case st of
(DbOpen qh) -> a (SQL.ReadHandle qh) (DbOpen (qh, _)) -> a (SQL.ReadHandle qh)
_ -> return mempty _ -> return mempty
return (v, st') return (v, st)
runReaderIO :: Monoid v => (SQL.ReadHandle -> IO v) -> Annex v runReaderIO :: Monoid v => DbTable -> (SQL.ReadHandle -> IO v) -> Annex v
runReaderIO a = runReader (liftIO . a) runReaderIO t a = runReader t (liftIO . a)
{- Runs an action that writes to the database. Typically this is used to {- Runs an action that writes to the database. Typically this is used to
- queue changes, which will be flushed at a later point. - queue changes, which will be flushed at a later point.
- -
- The database is created if it doesn't exist yet. -} - The database is created if it doesn't exist yet. -}
runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex () runWriter :: DbTable -> (SQL.WriteHandle -> Annex ()) -> Annex ()
runWriter a = do runWriter t a = do
h <- Annex.getRead Annex.keysdbhandle h <- Annex.getRead Annex.keysdbhandle
withDbState h go withDbState h go
where where
go st@(DbOpen qh) = do go (DbOpen (qh, tableschanged)) = do
v <- a (SQL.WriteHandle qh) v <- a (SQL.WriteHandle qh)
return (v, st) return (v, DbOpen (qh, addDbTable tableschanged t))
go st = do go st = do
st' <- openDb True st st' <- openDb True st
v <- case st' of v <- case st' of
DbOpen qh -> a (SQL.WriteHandle qh) DbOpen (qh, _) -> a (SQL.WriteHandle qh)
_ -> error "internal" _ -> error "internal"
return (v, st') return (v, st')
runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex () runWriterIO :: DbTable -> (SQL.WriteHandle -> IO ()) -> Annex ()
runWriterIO a = runWriter (liftIO . a) runWriterIO t a = runWriter t (liftIO . a)
{- Opens the database, creating it if it doesn't exist yet. {- Opens the database, creating it if it doesn't exist yet.
- -
@ -139,8 +145,8 @@ openDb forwrite _ = do
open db = do open db = do
qh <- liftIO $ H.openDbQueue db SQL.containedTable qh <- liftIO $ H.openDbQueue db SQL.containedTable
reconcileStaged qh tc <- reconcileStaged qh
return $ DbOpen qh return $ DbOpen (qh, tc)
{- Closes the database if it was open. Any writes will be flushed to it. {- Closes the database if it was open. Any writes will be flushed to it.
- -
@ -155,12 +161,13 @@ flushDb :: Annex ()
flushDb = liftIO . flushDbQueue =<< Annex.getRead Annex.keysdbhandle flushDb = liftIO . flushDbQueue =<< Annex.getRead Annex.keysdbhandle
addAssociatedFile :: Key -> TopFilePath -> Annex () addAssociatedFile :: Key -> TopFilePath -> Annex ()
addAssociatedFile k f = runWriterIO $ SQL.addAssociatedFile k f addAssociatedFile k f = runWriterIO AssociatedTable $ SQL.addAssociatedFile k f
{- Note that the files returned were once associated with the key, but {- Note that the files returned were once associated with the key, but
- some of them may not be any longer. -} - some of them may not be any longer. -}
getAssociatedFiles :: Key -> Annex [TopFilePath] getAssociatedFiles :: Key -> Annex [TopFilePath]
getAssociatedFiles k = emptyWhenBare $ runReaderIO $ SQL.getAssociatedFiles k getAssociatedFiles k = emptyWhenBare $ runReaderIO AssociatedTable $
SQL.getAssociatedFiles k
{- Queries for associated files never return anything when in a bare {- Queries for associated files never return anything when in a bare
- repository, since without a work tree there can be no associated files. - repository, since without a work tree there can be no associated files.
@ -186,10 +193,12 @@ getAssociatedFilesIncluding afile k = emptyWhenBare $ do
{- Gets any keys that are on record as having a particular associated file. {- Gets any keys that are on record as having a particular associated file.
- (Should be one or none but the database doesn't enforce that.) -} - (Should be one or none but the database doesn't enforce that.) -}
getAssociatedKey :: TopFilePath -> Annex [Key] getAssociatedKey :: TopFilePath -> Annex [Key]
getAssociatedKey f = emptyWhenBare $ runReaderIO $ SQL.getAssociatedKey f getAssociatedKey f = emptyWhenBare $ runReaderIO AssociatedTable $
SQL.getAssociatedKey f
removeAssociatedFile :: Key -> TopFilePath -> Annex () removeAssociatedFile :: Key -> TopFilePath -> Annex ()
removeAssociatedFile k = runWriterIO . SQL.removeAssociatedFile k removeAssociatedFile k = runWriterIO AssociatedTable .
SQL.removeAssociatedFile k
{- Stats the files, and stores their InodeCaches. -} {- Stats the files, and stores their InodeCaches. -}
storeInodeCaches :: Key -> [RawFilePath] -> Annex () storeInodeCaches :: Key -> [RawFilePath] -> Annex ()
@ -198,7 +207,7 @@ storeInodeCaches k fs = withTSDelta $ \d ->
=<< liftIO (mapM (\f -> genInodeCache f d) fs) =<< liftIO (mapM (\f -> genInodeCache f d) fs)
addInodeCaches :: Key -> [InodeCache] -> Annex () addInodeCaches :: Key -> [InodeCache] -> Annex ()
addInodeCaches k is = runWriterIO $ SQL.addInodeCaches k is addInodeCaches k is = runWriterIO ContentTable $ SQL.addInodeCaches k is
{- A key may have multiple InodeCaches; one for the annex object, and one {- A key may have multiple InodeCaches; one for the annex object, and one
- for each pointer file that is a copy of it. - for each pointer file that is a copy of it.
@ -210,18 +219,19 @@ addInodeCaches k is = runWriterIO $ SQL.addInodeCaches k is
- for pointer files, but none recorded for the annex object. - for pointer files, but none recorded for the annex object.
-} -}
getInodeCaches :: Key -> Annex [InodeCache] getInodeCaches :: Key -> Annex [InodeCache]
getInodeCaches = runReaderIO . SQL.getInodeCaches getInodeCaches = runReaderIO ContentTable . SQL.getInodeCaches
{- Remove all inodes cached for a key. -} {- Remove all inodes cached for a key. -}
removeInodeCaches :: Key -> Annex () removeInodeCaches :: Key -> Annex ()
removeInodeCaches = runWriterIO . SQL.removeInodeCaches removeInodeCaches = runWriterIO ContentTable . SQL.removeInodeCaches
{- Remove cached inodes, for any key. -} {- Remove cached inodes, for any key. -}
removeInodeCache :: InodeCache -> Annex () removeInodeCache :: InodeCache -> Annex ()
removeInodeCache = runWriterIO . SQL.removeInodeCache removeInodeCache = runWriterIO ContentTable . SQL.removeInodeCache
isInodeKnown :: InodeCache -> SentinalStatus -> Annex Bool isInodeKnown :: InodeCache -> SentinalStatus -> Annex Bool
isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s) isInodeKnown i s = or <$> runReaderIO ContentTable
((:[]) <$$> SQL.isInodeKnown i s)
{- Looks at staged changes to annexed files, and updates the keys database, {- Looks at staged changes to annexed files, and updates the keys database,
- so that its information is consistent with the state of the repository. - so that its information is consistent with the state of the repository.
@ -250,18 +260,21 @@ isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s)
- So when using getAssociatedFiles, have to make sure the file still - So when using getAssociatedFiles, have to make sure the file still
- is an associated file. - is an associated file.
-} -}
reconcileStaged :: H.DbQueue -> Annex () reconcileStaged :: H.DbQueue -> Annex DbTablesChanged
reconcileStaged qh = unlessM (Git.Config.isBare <$> gitRepo) $ do reconcileStaged qh = ifM (Git.Config.isBare <$> gitRepo)
( return mempty
, do
gitindex <- inRepo currentIndexFile gitindex <- inRepo currentIndexFile
indexcache <- fromRawFilePath <$> calcRepo' gitAnnexKeysDbIndexCache indexcache <- fromRawFilePath <$> calcRepo' gitAnnexKeysDbIndexCache
withTSDelta (liftIO . genInodeCache gitindex) >>= \case withTSDelta (liftIO . genInodeCache gitindex) >>= \case
Just cur -> readindexcache indexcache >>= \case Just cur -> readindexcache indexcache >>= \case
Nothing -> go cur indexcache =<< getindextree Nothing -> go cur indexcache =<< getindextree
Just prev -> ifM (compareInodeCaches prev cur) Just prev -> ifM (compareInodeCaches prev cur)
( noop ( return mempty
, go cur indexcache =<< getindextree , go cur indexcache =<< getindextree
) )
Nothing -> noop Nothing -> return mempty
)
where where
lastindexref = Ref "refs/annex/last-index" lastindexref = Ref "refs/annex/last-index"
@ -286,6 +299,7 @@ reconcileStaged qh = unlessM (Git.Config.isBare <$> gitRepo) $ do
-- against next time. -- against next time.
inRepo $ update' lastindexref newtree inRepo $ update' lastindexref newtree
fastDebug "Database.Keys" "reconcileStaged end" fastDebug "Database.Keys" "reconcileStaged end"
return (DbTablesChanged True True)
-- git write-tree will fail if the index is locked or when there is -- git write-tree will fail if the index is locked or when there is
-- a merge conflict. To get up-to-date with the current index, -- a merge conflict. To get up-to-date with the current index,
-- diff --staged with the old index tree. The current index tree -- diff --staged with the old index tree. The current index tree
@ -307,6 +321,7 @@ reconcileStaged qh = unlessM (Git.Config.isBare <$> gitRepo) $ do
void $ updatetodiff g Nothing "--staged" void $ updatetodiff g Nothing "--staged"
(procmergeconflictdiff mdfeeder) (procmergeconflictdiff mdfeeder)
fastDebug "Database.Keys" "reconcileStaged end" fastDebug "Database.Keys" "reconcileStaged end"
return (DbTablesChanged True True)
updatetodiff g old new processor = do updatetodiff g old new processor = do
(l, cleanup) <- pipeNullSplit' (diff old new) g (l, cleanup) <- pipeNullSplit' (diff old new) g
@ -482,3 +497,9 @@ reconcileStaged qh = unlessM (Git.Config.isBare <$> gitRepo) $ do
largediff :: Int largediff :: Int
largediff = 1000 largediff = 1000
{- Normally the keys database is updated incrementally when opened,
- by reconcileStaged. Calling this explicitly allows running the
- update at an earlier point.
-}
updateDatabase :: Annex ()
updateDatabase = runWriter ContentTable (const noop)

View file

@ -15,6 +15,7 @@ module Database.Keys.Handle (
) where ) where
import qualified Database.Queue as H import qualified Database.Queue as H
import Database.Keys.Tables
import Utility.Exception import Utility.Exception
import Utility.DebugLocks import Utility.DebugLocks
@ -29,7 +30,7 @@ newtype DbHandle = DbHandle (MVar DbState)
-- The database can be closed or open, but it also may have been -- The database can be closed or open, but it also may have been
-- tried to open (for read) and didn't exist yet or is not readable. -- tried to open (for read) and didn't exist yet or is not readable.
data DbState = DbClosed | DbOpen H.DbQueue | DbUnavailable data DbState = DbClosed | DbOpen (H.DbQueue, DbTablesChanged) | DbUnavailable
newDbHandle :: IO DbHandle newDbHandle :: IO DbHandle
newDbHandle = DbHandle <$> newMVar DbClosed newDbHandle = DbHandle <$> newMVar DbClosed
@ -52,15 +53,17 @@ withDbState (DbHandle mvar) a = do
return v return v
flushDbQueue :: DbHandle -> IO () flushDbQueue :: DbHandle -> IO ()
flushDbQueue (DbHandle mvar) = go =<< debugLocks (readMVar mvar) flushDbQueue h = withDbState h go
where where
go (DbOpen qh) = H.flushDbQueue qh go (DbOpen (qh, _)) = do
go _ = return () H.flushDbQueue qh
return ((), DbOpen (qh, mempty))
go st = return ((), st)
closeDbHandle :: DbHandle -> IO () closeDbHandle :: DbHandle -> IO ()
closeDbHandle h = withDbState h go closeDbHandle h = withDbState h go
where where
go (DbOpen qh) = do go (DbOpen (qh, _)) = do
H.closeDbQueue qh H.closeDbQueue qh
return ((), DbClosed) return ((), DbClosed)
go st = return ((), st) go st = return ((), st)

38
Database/Keys/Tables.hs Normal file
View file

@ -0,0 +1,38 @@
{- Keeping track of which tables in the keys database have changed
-
- Copyright 2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Database.Keys.Tables where
import Data.Monoid
import qualified Data.Semigroup as Sem
import Prelude
data DbTable = AssociatedTable | ContentTable
deriving (Eq, Show)
data DbTablesChanged = DbTablesChanged
{ associatedTable :: Bool
, contentTable :: Bool
}
deriving (Show)
instance Sem.Semigroup DbTablesChanged where
a <> b = DbTablesChanged
{ associatedTable = associatedTable a || associatedTable b
, contentTable = contentTable a || contentTable b
}
instance Monoid DbTablesChanged where
mempty = DbTablesChanged False False
addDbTable :: DbTablesChanged -> DbTable -> DbTablesChanged
addDbTable ts AssociatedTable = ts { associatedTable = True }
addDbTable ts ContentTable = ts { contentTable = True }
isDbTableChanged :: DbTablesChanged -> DbTable -> Bool
isDbTableChanged ts AssociatedTable = associatedTable ts
isDbTableChanged ts ContentTable = contentTable ts

View file

@ -16,6 +16,7 @@ import Types.Upgrade
import Annex.CatFile import Annex.CatFile
import qualified Database.Keys import qualified Database.Keys
import qualified Database.Keys.SQL import qualified Database.Keys.SQL
import Database.Keys.Tables
import qualified Git.LsFiles as LsFiles import qualified Git.LsFiles as LsFiles
import qualified Git import qualified Git
import Git.FilePath import Git.FilePath
@ -114,8 +115,9 @@ populateKeysDb = unlessM isBareRepo $ do
Nothing -> noop Nothing -> noop
Just k -> do Just k -> do
topf <- inRepo $ toTopFilePath $ toRawFilePath f topf <- inRepo $ toTopFilePath $ toRawFilePath f
Database.Keys.runWriter $ \h -> liftIO $ do Database.Keys.runWriter AssociatedTable $ \h -> liftIO $
Database.Keys.SQL.addAssociatedFile k topf h Database.Keys.SQL.addAssociatedFile k topf h
Database.Keys.runWriter ContentTable $ \h -> liftIO $
Database.Keys.SQL.addInodeCaches k [ic] h Database.Keys.SQL.addInodeCaches k [ic] h
liftIO $ void cleanup liftIO $ void cleanup
Database.Keys.closeDb Database.Keys.closeDb

View file

@ -830,6 +830,7 @@ Executable git-annex
Database.Init Database.Init
Database.Keys Database.Keys
Database.Keys.Handle Database.Keys.Handle
Database.Keys.Tables
Database.Keys.SQL Database.Keys.SQL
Database.Queue Database.Queue
Database.Types Database.Types