From f9c5aa84e058d8011f3fcdb82687a325cda8586e Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 12 Jan 2016 13:01:44 -0400 Subject: [PATCH] add database benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark shows that the database access is quite fast indeed! And, it scales linearly to the number of keys, with one exception, getAssociatedKey. Based on this benchmark, I don't think I need worry about optimising for cases where all files are locked and the database is mostly empty. In those cases, database access will be misses, and according to this benchmark, should add only 50 milliseconds to runtime. (NB: There may be some overhead to getting the database opened and locking the handle that this benchmark doesn't see.) joey@darkstar:~/src/git-annex>./git-annex benchmark setting up database with 1000 setting up database with 10000 benchmarking keys database/getAssociatedFiles from 1000 (hit) time 62.77 μs (62.70 μs .. 62.85 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 62.81 μs (62.76 μs .. 62.88 μs) std dev 201.6 ns (157.5 ns .. 259.5 ns) benchmarking keys database/getAssociatedFiles from 1000 (miss) time 50.02 μs (49.97 μs .. 50.07 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 50.09 μs (50.04 μs .. 50.17 μs) std dev 206.7 ns (133.8 ns .. 295.3 ns) benchmarking keys database/getAssociatedKey from 1000 (hit) time 211.2 μs (210.5 μs .. 212.3 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 211.0 μs (210.7 μs .. 212.0 μs) std dev 1.685 μs (334.4 ns .. 3.517 μs) benchmarking keys database/getAssociatedKey from 1000 (miss) time 173.5 μs (172.7 μs .. 174.2 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 173.7 μs (173.0 μs .. 175.5 μs) std dev 3.833 μs (1.858 μs .. 6.617 μs) variance introduced by outliers: 16% (moderately inflated) benchmarking keys database/getAssociatedFiles from 10000 (hit) time 64.01 μs (63.84 μs .. 64.18 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 64.85 μs (64.34 μs .. 66.02 μs) std dev 2.433 μs (547.6 ns .. 4.652 μs) variance introduced by outliers: 40% (moderately inflated) benchmarking keys database/getAssociatedFiles from 10000 (miss) time 50.33 μs (50.28 μs .. 50.39 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 50.32 μs (50.26 μs .. 50.38 μs) std dev 202.7 ns (167.6 ns .. 252.0 ns) benchmarking keys database/getAssociatedKey from 10000 (hit) time 1.142 ms (1.139 ms .. 1.146 ms) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.142 ms (1.140 ms .. 1.144 ms) std dev 7.142 μs (4.994 μs .. 10.98 μs) benchmarking keys database/getAssociatedKey from 10000 (miss) time 1.094 ms (1.092 ms .. 1.096 ms) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.095 ms (1.095 ms .. 1.097 ms) std dev 4.277 μs (2.591 μs .. 7.228 μs) --- CmdLine/GitAnnex.hs | 6 +++ Command/Benchmark.hs | 106 +++++++++++++++++++++++++++++++++++++++++++ Common.hs | 2 +- Database/Keys.hs | 5 +- Database/Keys/SQL.hs | 7 +++ doc/git-annex.mdwn | 5 ++ doc/todo/smudge.mdwn | 8 ---- git-annex.cabal | 8 ++++ 8 files changed, 135 insertions(+), 12 deletions(-) create mode 100644 Command/Benchmark.hs diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs index ba7689f702..0383dada34 100644 --- a/CmdLine/GitAnnex.hs +++ b/CmdLine/GitAnnex.hs @@ -115,6 +115,9 @@ import qualified Command.Test import qualified Command.FuzzTest import qualified Command.TestRemote #endif +#ifdef WITH_BENCHMARK +import qualified Command.Benchmark +#endif #ifdef WITH_EKG import System.Remote.Monitoring #endif @@ -220,6 +223,9 @@ cmds testoptparser testrunner = #ifdef WITH_TESTSUITE , Command.FuzzTest.cmd , Command.TestRemote.cmd +#endif +#ifdef WITH_BENCHMARK + , Command.Benchmark.cmd #endif ] diff --git a/Command/Benchmark.hs b/Command/Benchmark.hs new file mode 100644 index 0000000000..d4585fdb88 --- /dev/null +++ b/Command/Benchmark.hs @@ -0,0 +1,106 @@ +{- git-annex benchmark + - + - Copyright 2016 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +{-# OPTIONS_GHC -fno-warn-orphans #-} + +module Command.Benchmark where + +import Command +import Database.Types +import qualified Database.Keys.SQL as SQL +import qualified Database.Queue as H +import Utility.Tmp +import Git.FilePath + +import Criterion.Main +import Criterion.Internal (runAndAnalyse) +import Criterion.Monad +import Control.Monad.IO.Class (liftIO) +import Control.Monad +import Control.DeepSeq +import System.FilePath +import System.Random + +cmd :: Command +cmd = noRepo (withParams benchmark) $ + dontCheck repoExists $ + command "benchmark" SectionTesting + "run benchmarks" + paramNothing + (withParams (liftIO . benchmark)) + +benchmark :: CmdParams -> IO () +benchmark _ = withTmpDirIn "." "benchmark" $ \tmpdir -> do + -- benchmark different sizes of databases + dbs <- mapM (benchDb tmpdir) + [ 1000 + , 10000 + -- , 100000 + ] + -- can't use Criterion's defaultMain here because it looks at + -- command-line parameters + withConfig defaultConfig $ runAndAnalyse (const True) $ + bgroup "keys database" $ flip concatMap dbs $ \db -> + [ getAssociatedFilesHitBench db + , getAssociatedFilesMissBench db + , getAssociatedKeyHitBench db + , getAssociatedKeyMissBench db + ] + +getAssociatedFilesHitBench :: BenchDb -> Benchmark +getAssociatedFilesHitBench ( BenchDb h num) = bench ("getAssociatedFiles from " ++ show num ++ " (hit)") $ nfIO $ do + n <- getStdRandom (randomR (1,num)) + SQL.getAssociatedFiles (keyN n) (SQL.ReadHandle h) + +getAssociatedFilesMissBench :: BenchDb -> Benchmark +getAssociatedFilesMissBench ( BenchDb h num) = bench ("getAssociatedFiles from " ++ show num ++ " (miss)") $ nfIO $ + SQL.getAssociatedFiles keyMiss (SQL.ReadHandle h) + +getAssociatedKeyHitBench :: BenchDb -> Benchmark +getAssociatedKeyHitBench (BenchDb h num) = bench ("getAssociatedKey from " ++ show num ++ " (hit)") $ nfIO $ do + n <- getStdRandom (randomR (1,num)) + SQL.getAssociatedKey (fileN n) (SQL.ReadHandle h) + +getAssociatedKeyMissBench :: BenchDb -> Benchmark +getAssociatedKeyMissBench (BenchDb h num) = bench ("getAssociatedKey from " ++ show num ++ " (miss)") $ nfIO $ + SQL.getAssociatedKey fileMiss (SQL.ReadHandle h) + +populateAssociatedFiles :: H.DbQueue -> Int -> IO () +populateAssociatedFiles h num = do + forM_ [1..num] $ \n -> + SQL.addAssociatedFile (keyN n) (fileN n) (SQL.WriteHandle h) + H.flushDbQueue h + +keyN :: Int -> SKey +keyN n = SKey ("key" ++ show n) + +fileN :: Int -> TopFilePath +fileN n = asTopFilePath ("file" ++ show n) + +keyMiss :: SKey +keyMiss = keyN 0 -- 0 is never stored + +fileMiss :: TopFilePath +fileMiss = fileN 0 -- 0 is never stored + +data BenchDb = BenchDb H.DbQueue Int + +benchDb :: FilePath -> Int -> IO BenchDb +benchDb tmpdir num = do + putStrLn $ "setting up database with " ++ show num + H.initDb f SQL.createTables + h <- H.openDbQueue f SQL.containedTable + populateAssociatedFiles h num + return (BenchDb h num) + where + f = tmpdir "db" ++ show num + +instance NFData TopFilePath where + rnf = rnf . getTopFilePath + +instance NFData SKey where + rnf (SKey s) = rnf s diff --git a/Common.hs b/Common.hs index 8272043c2a..2b8b9a8053 100644 --- a/Common.hs +++ b/Common.hs @@ -5,7 +5,7 @@ module Common (module X) where import Control.Monad as X import Control.Monad.IfElse as X import Control.Applicative as X -import "mtl" Control.Monad.State.Strict as X (liftIO) +import Control.Monad.IO.Class as X (liftIO) import Data.Maybe as X import Data.List as X hiding (head, tail, init, last) diff --git a/Database/Keys.hs b/Database/Keys.hs index a711ba7caa..faf34132a0 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -127,14 +127,13 @@ openDb createdb _ = withExclusiveLock gitAnnexKeysDbLock $ do (False, True) -> do liftIO $ do createDirectoryIfMissing True dbdir - H.initDb db $ void $ - runMigrationSilent SQL.migrateKeysDb + H.initDb db SQL.createTables setAnnexDirPerm dbdir setAnnexFilePerm db open db (False, False) -> return DbEmpty where - open db = liftIO $ DbOpen <$> H.openDbQueue db "content" + open db = liftIO $ DbOpen <$> H.openDbQueue db SQL.containedTable addAssociatedFile :: Key -> TopFilePath -> Annex () addAssociatedFile k f = runWriterIO $ SQL.addAssociatedFile (toSKey k) f diff --git a/Database/Keys/SQL.hs b/Database/Keys/SQL.hs index 22bcb86a18..6862b15d9b 100644 --- a/Database/Keys/SQL.hs +++ b/Database/Keys/SQL.hs @@ -13,6 +13,7 @@ module Database.Keys.SQL where import Database.Types +import Database.Handle import qualified Database.Queue as H import Utility.InodeCache import Git.FilePath @@ -33,6 +34,12 @@ Content KeyCacheIndex key cache |] +containedTable :: TableName +containedTable = "content" + +createTables :: SqlPersistM () +createTables = void $ runMigrationSilent migrateKeysDb + newtype ReadHandle = ReadHandle H.DbQueue readDb :: SqlPersistM a -> ReadHandle -> IO a diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 299428d1e5..329fb89327 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -672,6 +672,11 @@ subdirectories). See [[git-annex-fuzztest]](1) for details. +* `benchmark` + + This runs git-annex's built-in benchmarks, if it was built with + benchmarking support. + # COMMON OPTIONS These common options are accepted by all git-annex commands, and diff --git a/doc/todo/smudge.mdwn b/doc/todo/smudge.mdwn index 6498863e4a..5f3d521bf6 100644 --- a/doc/todo/smudge.mdwn +++ b/doc/todo/smudge.mdwn @@ -32,14 +32,6 @@ git-annex should use smudge/clean filters. when pushing changes committed in such a repo. Ideally, should avoid committing implicit unlocks, or should prevent such commits leaking out in pushes. -* Optimisation: See if the database schema can be improved to speed things - up. Are there enough indexes? getAssociatedKey in particular does a - reverse lookup and might benefit from an index. -* Optimisation: Reads from the Keys database avoid doing anything if the - database doesn't exist. This makes v5 repos, or v6 with all locked files - faster. However, if a v6 repo unlocks and then re-locks a file, its - database will exist, and so this optimisation will no longer apply. - Could try to detect when the database is empty, and remove it or avoid reads. * Eventually (but not yet), make v6 the default for new repositories. Note that the assistant forces repos into direct mode; that will need to diff --git a/git-annex.cabal b/git-annex.cabal index 5c09a0c610..289e7a2554 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -88,6 +88,10 @@ Flag EKG Description: Enable use of EKG to monitor git-annex as it runs (at http://localhost:4242/) Default: False +Flag Benchmark + Description: Enable benchmarking + Default: False + Flag network-uri Description: Get Network.URI from the network-uri package Default: True @@ -260,6 +264,10 @@ Executable git-annex Build-Depends: ekg GHC-Options: -with-rtsopts=-T CPP-Options: -DWITH_EKG + + if flag(Benchmark) + Build-Depends: criterion, deepseq + CPP-Options: -DWITH_BENCHMARK source-repository head type: git