git-annex/Database/Keys/SQL.hs

161 lines
5.2 KiB
Haskell
Raw Normal View History

2016-01-11 19:52:11 +00:00
{- Sqlite database of information about Keys
-
- Copyright 2015-2022 Joey Hess <id@joeyh.name>
2016-01-11 19:52:11 +00:00
-
- Licensed under the GNU AGPL version 3 or higher.
2016-01-11 19:52:11 +00:00
-}
{-# LANGUAGE CPP #-}
2016-01-11 19:52:11 +00:00
{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-}
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes, ScopedTypeVariables #-}
{-# LANGUAGE DataKinds, FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}
#if MIN_VERSION_persistent_template(2,8,0)
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE StandaloneDeriving #-}
#endif
2016-01-11 19:52:11 +00:00
module Database.Keys.SQL where
import Database.Types
add database benchmark The benchmark shows that the database access is quite fast indeed! And, it scales linearly to the number of keys, with one exception, getAssociatedKey. Based on this benchmark, I don't think I need worry about optimising for cases where all files are locked and the database is mostly empty. In those cases, database access will be misses, and according to this benchmark, should add only 50 milliseconds to runtime. (NB: There may be some overhead to getting the database opened and locking the handle that this benchmark doesn't see.) joey@darkstar:~/src/git-annex>./git-annex benchmark setting up database with 1000 setting up database with 10000 benchmarking keys database/getAssociatedFiles from 1000 (hit) time 62.77 μs (62.70 μs .. 62.85 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 62.81 μs (62.76 μs .. 62.88 μs) std dev 201.6 ns (157.5 ns .. 259.5 ns) benchmarking keys database/getAssociatedFiles from 1000 (miss) time 50.02 μs (49.97 μs .. 50.07 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 50.09 μs (50.04 μs .. 50.17 μs) std dev 206.7 ns (133.8 ns .. 295.3 ns) benchmarking keys database/getAssociatedKey from 1000 (hit) time 211.2 μs (210.5 μs .. 212.3 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 211.0 μs (210.7 μs .. 212.0 μs) std dev 1.685 μs (334.4 ns .. 3.517 μs) benchmarking keys database/getAssociatedKey from 1000 (miss) time 173.5 μs (172.7 μs .. 174.2 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 173.7 μs (173.0 μs .. 175.5 μs) std dev 3.833 μs (1.858 μs .. 6.617 μs) variance introduced by outliers: 16% (moderately inflated) benchmarking keys database/getAssociatedFiles from 10000 (hit) time 64.01 μs (63.84 μs .. 64.18 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 64.85 μs (64.34 μs .. 66.02 μs) std dev 2.433 μs (547.6 ns .. 4.652 μs) variance introduced by outliers: 40% (moderately inflated) benchmarking keys database/getAssociatedFiles from 10000 (miss) time 50.33 μs (50.28 μs .. 50.39 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 50.32 μs (50.26 μs .. 50.38 μs) std dev 202.7 ns (167.6 ns .. 252.0 ns) benchmarking keys database/getAssociatedKey from 10000 (hit) time 1.142 ms (1.139 ms .. 1.146 ms) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.142 ms (1.140 ms .. 1.144 ms) std dev 7.142 μs (4.994 μs .. 10.98 μs) benchmarking keys database/getAssociatedKey from 10000 (miss) time 1.094 ms (1.092 ms .. 1.096 ms) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.095 ms (1.095 ms .. 1.097 ms) std dev 4.277 μs (2.591 μs .. 7.228 μs)
2016-01-12 17:01:44 +00:00
import Database.Handle
import Database.Utility
2016-01-11 19:52:11 +00:00
import qualified Database.Queue as H
import Utility.InodeCache
import Git.FilePath
import Database.Persist.Sql hiding (Key)
2016-01-11 19:52:11 +00:00
import Database.Persist.TH
import Control.Monad
import Data.Maybe
2016-01-11 19:52:11 +00:00
-- Note on indexes: KeyFileIndex etc are really uniqueness constraints,
-- which cause sqlite to automatically add indexes. So when adding indexes,
-- have to take care to only add ones that work as uniqueness constraints.
2023-03-14 02:39:16 +00:00
-- (Unfortunately persistent does not support indexes that are not
-- uniqueness constraints; https://github.com/yesodweb/persistent/issues/109)
--
-- To speed up queries for a key, there's KeyFileIndex,
-- which makes there be a covering index for keys.
--
-- FileKeyIndex speeds up queries that include the file, since
-- it makes there be a covering index for files. Note that, despite the name, it is
-- used as a uniqueness constraint now.
2016-01-11 19:52:11 +00:00
share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase|
Associated
key Key
file SFilePath
2016-01-11 19:52:11 +00:00
KeyFileIndex key file
FileKeyIndex file key
2016-01-11 19:52:11 +00:00
Content
key Key
2019-10-30 17:02:16 +00:00
inodecache InodeCache
2019-10-30 19:16:03 +00:00
filesize FileSize
mtime EpochTime
2019-10-30 17:02:16 +00:00
KeyInodeCacheIndex key inodecache
InodeCacheKeyIndex inodecache key
2016-01-11 19:52:11 +00:00
|]
add database benchmark The benchmark shows that the database access is quite fast indeed! And, it scales linearly to the number of keys, with one exception, getAssociatedKey. Based on this benchmark, I don't think I need worry about optimising for cases where all files are locked and the database is mostly empty. In those cases, database access will be misses, and according to this benchmark, should add only 50 milliseconds to runtime. (NB: There may be some overhead to getting the database opened and locking the handle that this benchmark doesn't see.) joey@darkstar:~/src/git-annex>./git-annex benchmark setting up database with 1000 setting up database with 10000 benchmarking keys database/getAssociatedFiles from 1000 (hit) time 62.77 μs (62.70 μs .. 62.85 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 62.81 μs (62.76 μs .. 62.88 μs) std dev 201.6 ns (157.5 ns .. 259.5 ns) benchmarking keys database/getAssociatedFiles from 1000 (miss) time 50.02 μs (49.97 μs .. 50.07 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 50.09 μs (50.04 μs .. 50.17 μs) std dev 206.7 ns (133.8 ns .. 295.3 ns) benchmarking keys database/getAssociatedKey from 1000 (hit) time 211.2 μs (210.5 μs .. 212.3 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 211.0 μs (210.7 μs .. 212.0 μs) std dev 1.685 μs (334.4 ns .. 3.517 μs) benchmarking keys database/getAssociatedKey from 1000 (miss) time 173.5 μs (172.7 μs .. 174.2 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 173.7 μs (173.0 μs .. 175.5 μs) std dev 3.833 μs (1.858 μs .. 6.617 μs) variance introduced by outliers: 16% (moderately inflated) benchmarking keys database/getAssociatedFiles from 10000 (hit) time 64.01 μs (63.84 μs .. 64.18 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 64.85 μs (64.34 μs .. 66.02 μs) std dev 2.433 μs (547.6 ns .. 4.652 μs) variance introduced by outliers: 40% (moderately inflated) benchmarking keys database/getAssociatedFiles from 10000 (miss) time 50.33 μs (50.28 μs .. 50.39 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 50.32 μs (50.26 μs .. 50.38 μs) std dev 202.7 ns (167.6 ns .. 252.0 ns) benchmarking keys database/getAssociatedKey from 10000 (hit) time 1.142 ms (1.139 ms .. 1.146 ms) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.142 ms (1.140 ms .. 1.144 ms) std dev 7.142 μs (4.994 μs .. 10.98 μs) benchmarking keys database/getAssociatedKey from 10000 (miss) time 1.094 ms (1.092 ms .. 1.096 ms) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.095 ms (1.095 ms .. 1.097 ms) std dev 4.277 μs (2.591 μs .. 7.228 μs)
2016-01-12 17:01:44 +00:00
containedTable :: TableName
containedTable = "content"
createTables :: SqlPersistM ()
createTables = void $ runMigrationSilent migrateKeysDb
2016-01-11 19:52:11 +00:00
newtype ReadHandle = ReadHandle H.DbQueue
readDb :: SqlPersistM a -> ReadHandle -> IO a
readDb a (ReadHandle h) = H.queryDbQueue h a
newtype WriteHandle = WriteHandle H.DbQueue
queueDb :: SqlPersistM () -> WriteHandle -> IO ()
queueDb a (WriteHandle h) = H.queueDb h checkcommit a
where
-- commit queue after 10000 changes
checkcommit sz _lastcommittime = pure (sz > 10000)
2016-01-11 19:52:11 +00:00
-- Insert the associated file.
-- When the file was associated with a different key before,
-- update it to the new key.
addAssociatedFile :: Key -> TopFilePath -> WriteHandle -> IO ()
addAssociatedFile k f = queueDb $
void $ upsertBy
(FileKeyIndex af k)
(Associated k af)
[AssociatedFile =. af, AssociatedKey =. k]
where
af = SFilePath (getTopFilePath f)
2016-01-11 19:52:11 +00:00
-- Faster than addAssociatedFile, but only safe to use when the file
-- was not associated with a different key before, as it does not delete
-- any old key.
newAssociatedFile :: Key -> TopFilePath -> WriteHandle -> IO ()
newAssociatedFile k f = queueDb $
insert_ $ Associated k af
where
af = SFilePath (getTopFilePath f)
2016-01-11 19:52:11 +00:00
{- Note that the files returned were once associated with the key, but
- some of them may not be any longer. -}
getAssociatedFiles :: Key -> ReadHandle -> IO [TopFilePath]
getAssociatedFiles k = readDb $ do
l <- selectList [AssociatedKey ==. k] []
return $ map (asTopFilePath . (\(SFilePath f) -> f) . associatedFile . entityVal) l
2016-01-11 19:52:11 +00:00
{- Gets any keys that are on record as having a particular associated file.
- (Should be one or none.) -}
getAssociatedKey :: TopFilePath -> ReadHandle -> IO [Key]
2016-01-11 19:52:11 +00:00
getAssociatedKey f = readDb $ do
2018-11-04 20:46:39 +00:00
l <- selectList [AssociatedFile ==. af] []
return $ map (associatedKey . entityVal) l
where
af = SFilePath (getTopFilePath f)
2016-01-11 19:52:11 +00:00
removeAssociatedFile :: Key -> TopFilePath -> WriteHandle -> IO ()
removeAssociatedFile k f = queueDb $
deleteWhere [AssociatedKey ==. k, AssociatedFile ==. af]
where
af = SFilePath (getTopFilePath f)
2016-01-11 19:52:11 +00:00
addInodeCaches :: Key -> [InodeCache] -> WriteHandle -> IO ()
addInodeCaches k is = queueDb $
forM_ is $ \i -> insertUniqueFast $ Content k i
2019-10-30 19:16:03 +00:00
(inodeCacheToFileSize i)
(inodeCacheToEpochTime i)
2016-01-11 19:52:11 +00:00
{- A key may have multiple InodeCaches; one for the annex object, and one
- for each pointer file that is a copy of it. -}
getInodeCaches :: Key -> ReadHandle -> IO [InodeCache]
getInodeCaches k = readDb $ do
l <- selectList [ContentKey ==. k] []
2019-10-30 17:02:16 +00:00
return $ map (contentInodecache . entityVal) l
2016-01-11 19:52:11 +00:00
removeInodeCaches :: Key -> WriteHandle -> IO ()
removeInodeCaches k = queueDb $
deleteWhere [ContentKey ==. k]
removeInodeCache :: InodeCache -> WriteHandle -> IO ()
removeInodeCache i = queueDb $ deleteWhere
[ ContentInodecache ==. i
]
2019-10-30 19:16:03 +00:00
{- Check if the inode is known to be used for an annexed file. -}
isInodeKnown :: InodeCache -> SentinalStatus -> ReadHandle -> IO Bool
2019-10-30 19:16:03 +00:00
isInodeKnown i s = readDb (isJust <$> selectFirst q [])
where
2019-10-30 19:16:03 +00:00
q
| sentinalInodesChanged s =
2019-10-30 19:16:03 +00:00
-- Note that this select is intentionally not
-- indexed. Normally, the inodes have not changed,
-- and it would be unnecessary work to maintain
2019-10-30 19:16:03 +00:00
-- indexes for the unusual case.
[ ContentFilesize ==. inodeCacheToFileSize i
, ContentMtime >=. tmin
, ContentMtime <=. tmax
]
| otherwise = [ContentInodecache ==. i]
(tmin, tmax) = inodeCacheEpochTimeRange i