git-annex/Database/Keys/SQL.hs

{- Sqlite database of information about Keys
 -
 - Copyright 2015-2019 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
{-# LANGUAGE CPP #-}
{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-}
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes, ScopedTypeVariables #-}
{-# LANGUAGE UndecidableInstances #-}
#if MIN_VERSION_persistent_template(2,8,0)
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE StandaloneDeriving #-}
#endif
module Database.Keys.SQL where
import Database.Types
import Database.Handle
import qualified Database.Queue as H
import Utility.InodeCache
import Git.FilePath
import Database.Persist.Sql hiding (Key)
import Database.Persist.TH
import Data.Time.Clock
import Control.Monad
import Data.Maybe
-- Note on indexes: KeyFileIndex etc are really uniqueness constraints,
-- which cause sqlite to automatically add indexes. So when adding indexes,
-- have to take care to only add ones that work as uniqueness constraints.
-- (Unfortunately persistent does not support indexes that are not
-- uniqueness constraints; https://github.com/yesodweb/persistent/issues/109)
--
-- KeyFileIndex contains both the key and the file because the combined
-- pair is unique, whereas the same key can appear in the table multiple
-- times with different files.
--
-- The other benefit to including the file in the index is that it makes
-- queries that include the file faster, since it's a covering index.
--
-- The KeyFileIndex only speeds up selects for a key, since it comes first.
-- To also speed up selects for a file, there's a separate FileKeyIndex.
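--
-- For illustration, these uniqueness constraints correspond roughly to
-- the following sqlite DDL (a sketch; the exact statements persistent
-- emits may differ):
--
--   CREATE UNIQUE INDEX key_file_index ON associated (key, file);
--   CREATE UNIQUE INDEX file_key_index ON associated (file, key);
--
-- The first serves lookups by key (and covers the file column); the
-- second serves lookups by file.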
share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase|
Associated
  key Key
  file SFilePath
  KeyFileIndex key file
  FileKeyIndex file key
Content
  key Key
  inodecache InodeCache
  filesize FileSize
  mtime EpochTime
  KeyInodeCacheIndex key inodecache
  InodeCacheKeyIndex inodecache key
|]
containedTable :: TableName
containedTable = "content"

createTables :: SqlPersistM ()
createTables = void $ runMigrationSilent migrateKeysDb
newtype ReadHandle = ReadHandle H.DbQueue

readDb :: SqlPersistM a -> ReadHandle -> IO a
readDb a (ReadHandle h) = H.queryDbQueue h a

newtype WriteHandle = WriteHandle H.DbQueue

queueDb :: SqlPersistM () -> WriteHandle -> IO ()
queueDb a (WriteHandle h) = H.queueDb h checkcommit a
  where
	-- commit queue after 1000 changes or 5 minutes, whichever comes first
	checkcommit sz lastcommittime
		| sz > 1000 = return True
		| otherwise = do
			now <- getCurrentTime
			return $ diffUTCTime now lastcommittime > 300
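
-- To make the flush policy concrete (a sketch, not part of this module):
-- with 1001 queued changes, checkcommit returns True immediately; with
-- only 10 queued changes, it returns True once more than 300 seconds
-- have passed since the last commit. Equivalently:
--
-- > shouldCommit :: Int -> UTCTime -> UTCTime -> Bool
-- > shouldCommit sz lastcommittime now =
-- > 	sz > 1000 || diffUTCTime now lastcommittime > 300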
addAssociatedFile :: Key -> TopFilePath -> WriteHandle -> IO ()
addAssociatedFile k f = queueDb $ do
	-- If the same file was associated with a different key before,
	-- remove that.
	deleteWhere [AssociatedFile ==. af, AssociatedKey !=. k]
	void $ insertUnique $ Associated k af
  where
	af = SFilePath (getTopFilePath f)
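
-- In SQL terms the queued change above is roughly (a sketch):
--
-- > DELETE FROM associated WHERE file = :f AND key <> :k;
-- > INSERT INTO associated (key, file) VALUES (:k, :f);
--
-- where the insert is skipped when it would violate the KeyFileIndex
-- uniqueness constraint (that is what insertUnique provides).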
-- Does not remove any old association for a file, but is less expensive
-- than addAssociatedFile. Calling dropAllAssociatedFiles first and then
-- this is an efficient way to update all associated files.
addAssociatedFileFast :: Key -> TopFilePath -> WriteHandle -> IO ()
addAssociatedFileFast k f = queueDb $ void $ insertUnique $ Associated k af
  where
	af = SFilePath (getTopFilePath f)
dropAllAssociatedFiles :: WriteHandle -> IO ()
dropAllAssociatedFiles = queueDb $
	deleteWhere ([] :: [Filter Associated])
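
-- A sketch of the bulk-update pattern described above (hypothetical
-- caller; the list of key/file pairs stands in for whatever the real
-- caller iterates over):
--
-- > updateAll :: [(Key, TopFilePath)] -> WriteHandle -> IO ()
-- > updateAll pairs h = do
-- > 	dropAllAssociatedFiles h
-- > 	forM_ pairs $ \(k, f) -> addAssociatedFileFast k f h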
{- Note that the files returned were once associated with the key, but
 - some of them may not be any longer. -}
getAssociatedFiles :: Key -> ReadHandle -> IO [TopFilePath]
getAssociatedFiles k = readDb $ do
	l <- selectList [AssociatedKey ==. k] []
	return $ map (asTopFilePath . (\(SFilePath f) -> f) . associatedFile . entityVal) l
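
-- This is the query the covering KeyFileIndex is designed for; in SQL
-- terms, roughly (a sketch):
--
-- > SELECT file FROM associated WHERE key = :k;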
{- Gets any keys that are on record as having a particular associated file.
 - (Should be one or none but the database doesn't enforce that.) -}
getAssociatedKey :: TopFilePath -> ReadHandle -> IO [Key]
getAssociatedKey f = readDb $ do
	l <- selectList [AssociatedFile ==. af] []
	return $ map (associatedKey . entityVal) l
  where
	af = SFilePath (getTopFilePath f)
removeAssociatedFile :: Key -> TopFilePath -> WriteHandle -> IO ()
removeAssociatedFile k f = queueDb $
	deleteWhere [AssociatedKey ==. k, AssociatedFile ==. af]
  where
	af = SFilePath (getTopFilePath f)
addInodeCaches :: Key -> [InodeCache] -> WriteHandle -> IO ()
addInodeCaches k is = queueDb $
	forM_ is $ \i -> insertUnique $ Content k i
		(inodeCacheToFileSize i)
		(inodeCacheToEpochTime i)
{- A key may have multiple InodeCaches; one for the annex object, and one
 - for each pointer file that is a copy of it. -}
getInodeCaches :: Key -> ReadHandle -> IO [InodeCache]
getInodeCaches k = readDb $ do
	l <- selectList [ContentKey ==. k] []
	return $ map (contentInodecache . entityVal) l
removeInodeCaches :: Key -> WriteHandle -> IO ()
removeInodeCaches k = queueDb $
	deleteWhere [ContentKey ==. k]
{- Check if the inode is known to be used for an annexed file. -}
isInodeKnown :: InodeCache -> SentinalStatus -> ReadHandle -> IO Bool
isInodeKnown i s = readDb (isJust <$> selectFirst q [])
  where
	q
		| sentinalInodesChanged s =
			-- Note that this select is intentionally not
			-- indexed. Normally, the inodes have not changed,
			-- and it would be unnecessary work to maintain
			-- indexes for the unusual case.
			[ ContentFilesize ==. inodeCacheToFileSize i
			, ContentMtime >=. tmin
			, ContentMtime <=. tmax
			]
		| otherwise = [ContentInodecache ==. i]
	(tmin, tmax) = inodeCacheEpochTimeRange i
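
-- In the unusual sentinal-changed case, the query above is roughly
-- (a sketch):
--
-- > SELECT * FROM content
-- > 	WHERE filesize = :size AND mtime >= :tmin AND mtime <= :tmax
-- > 	LIMIT 1;
--
-- which falls back to matching on size and an mtime range, scanning the
-- table rather than using the inode cache indexes.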