git-annex/Database/Keys/SQL.hs

{- Sqlite database of information about Keys
 -
 - Copyright 2015-2016 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}

{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-}
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes, ScopedTypeVariables #-}

module Database.Keys.SQL where

import Database.Types
import Database.Handle
import qualified Database.Queue as H
import Utility.InodeCache
import Git.FilePath

import Database.Persist.TH
import Database.Esqueleto hiding (Key)
import Data.Time.Clock
import Control.Monad

share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase|
Associated
  key SKey
  file FilePath
  KeyFileIndex key file
  FileKeyIndex file key
Content
  key SKey
  cache SInodeCache
  KeyCacheIndex key cache
|]

containedTable :: TableName
containedTable = "content"

createTables :: SqlPersistM ()
createTables = void $ runMigrationSilent migrateKeysDb

newtype ReadHandle = ReadHandle H.DbQueue

readDb :: SqlPersistM a -> ReadHandle -> IO a
readDb a (ReadHandle h) = H.queryDbQueue h a

newtype WriteHandle = WriteHandle H.DbQueue

queueDb :: SqlPersistM () -> WriteHandle -> IO ()
queueDb a (WriteHandle h) = H.queueDb h checkcommit a
  where
	-- commit queue after 1000 changes or 5 minutes, whichever comes first
	checkcommit sz lastcommittime
		| sz > 1000 = return True
		| otherwise = do
			now <- getCurrentTime
			return $ diffUTCTime lastcommittime now > 300

addAssociatedFile :: SKey -> TopFilePath -> WriteHandle -> IO ()
addAssociatedFile sk f = queueDb $ do
	-- If the same file was associated with a different key before,
	-- remove that.
	delete $ from $ \r -> do
		where_ (r ^. AssociatedFile ==. val (getTopFilePath f) &&. not_ (r ^. AssociatedKey ==. val sk))
	void $ insertUnique $ Associated sk (getTopFilePath f)

{- Note that the files returned were once associated with the key, but
 - some of them may not be any longer. -}
getAssociatedFiles :: SKey -> ReadHandle -> IO [TopFilePath]
getAssociatedFiles sk = readDb $ do
	l <- select $ from $ \r -> do
		where_ (r ^. AssociatedKey ==. val sk)
		return (r ^. AssociatedFile)
	return $ map (asTopFilePath . unValue) l

{- Gets any keys that are on record as having a particular associated file.
 - (Should be one or none but the database doesn't enforce that.) -}
getAssociatedKey :: TopFilePath -> ReadHandle -> IO [SKey]
getAssociatedKey f = readDb $ do
	l <- select $ from $ \r -> do
		where_ (r ^. AssociatedFile ==. val (getTopFilePath f))
		return (r ^. AssociatedKey)
	return $ map unValue l

removeAssociatedFile :: SKey -> TopFilePath -> WriteHandle -> IO ()
removeAssociatedFile sk f = queueDb $ 
	delete $ from $ \r -> do
		where_ (r ^. AssociatedKey ==. val sk &&. r ^. AssociatedFile ==. val (getTopFilePath f))

addInodeCaches :: SKey -> [InodeCache] -> WriteHandle -> IO ()
addInodeCaches sk is = queueDb $
	forM_ is $ \i -> insertUnique $ Content sk (toSInodeCache i)

{- A key may have multiple InodeCaches; one for the annex object, and one
 - for each pointer file that is a copy of it. -}
getInodeCaches :: SKey -> ReadHandle -> IO [InodeCache]
getInodeCaches sk = readDb $ do
	l <- select $ from $ \r -> do
		where_ (r ^. ContentKey ==. val sk)
		return (r ^. ContentCache)
	return $ map (fromSInodeCache . unValue) l

removeInodeCaches :: SKey -> WriteHandle -> IO ()
removeInodeCaches sk = queueDb $ 
	delete $ from $ \r -> do
		where_ (r ^. ContentKey ==. val sk)
-												split out raw sql interface

											
										
										
											2016-01-11 19:52:11 +00:00
+								{- Sqlite database of information about Keys
 								 -
 								 - Copyright 2015-2016 Joey Hess <id@joeyh.name>
 								 -
 								 - Licensed under the GNU GPL version 3 or higher.
 								 -}
 								{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
 								{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-}
 								{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
 								{-# LANGUAGE RankNTypes, ScopedTypeVariables #-}
 								module Database.Keys.SQL where
 								import Database.Types
-												add database benchmark

The benchmark shows that the database access is quite fast indeed!
And, it scales linearly to the number of keys, with one exception,
getAssociatedKey.

Based on this benchmark, I don't think I need worry about optimising
for cases where all files are locked and the database is mostly empty.
In those cases, database access will be misses, and according to this
benchmark, should add only 50 milliseconds to runtime.

(NB: There may be some overhead to getting the database opened and locking
the handle that this benchmark doesn't see.)

joey@darkstar:~/src/git-annex>./git-annex benchmark
setting up database with 1000
setting up database with 10000
benchmarking keys database/getAssociatedFiles from 1000 (hit)
time                 62.77 μs   (62.70 μs .. 62.85 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 62.81 μs   (62.76 μs .. 62.88 μs)
std dev              201.6 ns   (157.5 ns .. 259.5 ns)

benchmarking keys database/getAssociatedFiles from 1000 (miss)
time                 50.02 μs   (49.97 μs .. 50.07 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 50.09 μs   (50.04 μs .. 50.17 μs)
std dev              206.7 ns   (133.8 ns .. 295.3 ns)

benchmarking keys database/getAssociatedKey from 1000 (hit)
time                 211.2 μs   (210.5 μs .. 212.3 μs)
                     1.000 R²   (0.999 R² .. 1.000 R²)
mean                 211.0 μs   (210.7 μs .. 212.0 μs)
std dev              1.685 μs   (334.4 ns .. 3.517 μs)

benchmarking keys database/getAssociatedKey from 1000 (miss)
time                 173.5 μs   (172.7 μs .. 174.2 μs)
                     1.000 R²   (0.999 R² .. 1.000 R²)
mean                 173.7 μs   (173.0 μs .. 175.5 μs)
std dev              3.833 μs   (1.858 μs .. 6.617 μs)
variance introduced by outliers: 16% (moderately inflated)

benchmarking keys database/getAssociatedFiles from 10000 (hit)
time                 64.01 μs   (63.84 μs .. 64.18 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 64.85 μs   (64.34 μs .. 66.02 μs)
std dev              2.433 μs   (547.6 ns .. 4.652 μs)
variance introduced by outliers: 40% (moderately inflated)

benchmarking keys database/getAssociatedFiles from 10000 (miss)
time                 50.33 μs   (50.28 μs .. 50.39 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 50.32 μs   (50.26 μs .. 50.38 μs)
std dev              202.7 ns   (167.6 ns .. 252.0 ns)

benchmarking keys database/getAssociatedKey from 10000 (hit)
time                 1.142 ms   (1.139 ms .. 1.146 ms)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 1.142 ms   (1.140 ms .. 1.144 ms)
std dev              7.142 μs   (4.994 μs .. 10.98 μs)

benchmarking keys database/getAssociatedKey from 10000 (miss)
time                 1.094 ms   (1.092 ms .. 1.096 ms)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 1.095 ms   (1.095 ms .. 1.097 ms)
std dev              4.277 μs   (2.591 μs .. 7.228 μs)

											
										
										
											2016-01-12 17:01:44 +00:00
+								import Database.Handle
-												split out raw sql interface

											
										
										
											2016-01-11 19:52:11 +00:00
+								import qualified Database.Queue as H
 								import Utility.InodeCache
 								import Git.FilePath
 								import Database.Persist.TH
 								import Database.Esqueleto hiding (Key)
 								import Data.Time.Clock
 								import Control.Monad
 								share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase|
 								Associated
 								  key SKey
 								  file FilePath
 								  KeyFileIndex key file
-												add FileKeyIndex to Keys db to optimize getAssociatedKey

This is a schema change so will break any existing keys databases. But,
it's not been released yet, so I'm still able to make such changes.

This speeds up the benchmark quite nicely:

benchmarking keys database/getAssociatedKey from 1000 (hit)
time                 91.65 μs   (91.48 μs .. 91.81 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 91.78 μs   (91.66 μs .. 91.94 μs)
std dev              468.3 ns   (353.1 ns .. 624.3 ns)

benchmarking keys database/getAssociatedKey from 1000 (miss)
time                 53.33 μs   (53.23 μs .. 53.40 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 53.43 μs   (53.36 μs .. 53.53 μs)
std dev              274.2 ns   (211.7 ns .. 361.5 ns)

benchmarking keys database/getAssociatedKey from 10000 (hit)
time                 92.99 μs   (92.74 μs .. 93.27 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 92.90 μs   (92.76 μs .. 93.16 μs)
std dev              608.7 ns   (404.1 ns .. 963.5 ns)

benchmarking keys database/getAssociatedKey from 10000 (miss)
time                 53.12 μs   (52.91 μs .. 53.39 μs)
                     1.000 R²   (0.999 R² .. 1.000 R²)
mean                 52.84 μs   (52.68 μs .. 53.16 μs)
std dev              715.4 ns   (400.4 ns .. 1.370 μs)

											
										
										
											2016-01-12 17:07:14 +00:00
+								  FileKeyIndex file key
-												split out raw sql interface

											
										
										
											2016-01-11 19:52:11 +00:00
+								Content
 								  key SKey
 								  cache SInodeCache
 								  KeyCacheIndex key cache
 								|]
-												add database benchmark

The benchmark shows that the database access is quite fast indeed!
And, it scales linearly to the number of keys, with one exception,
getAssociatedKey.

Based on this benchmark, I don't think I need worry about optimising
for cases where all files are locked and the database is mostly empty.
In those cases, database access will be misses, and according to this
benchmark, should add only 50 milliseconds to runtime.

(NB: There may be some overhead to getting the database opened and locking
the handle that this benchmark doesn't see.)

joey@darkstar:~/src/git-annex>./git-annex benchmark
setting up database with 1000
setting up database with 10000
benchmarking keys database/getAssociatedFiles from 1000 (hit)
time                 62.77 μs   (62.70 μs .. 62.85 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 62.81 μs   (62.76 μs .. 62.88 μs)
std dev              201.6 ns   (157.5 ns .. 259.5 ns)

benchmarking keys database/getAssociatedFiles from 1000 (miss)
time                 50.02 μs   (49.97 μs .. 50.07 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 50.09 μs   (50.04 μs .. 50.17 μs)
std dev              206.7 ns   (133.8 ns .. 295.3 ns)

benchmarking keys database/getAssociatedKey from 1000 (hit)
time                 211.2 μs   (210.5 μs .. 212.3 μs)
                     1.000 R²   (0.999 R² .. 1.000 R²)
mean                 211.0 μs   (210.7 μs .. 212.0 μs)
std dev              1.685 μs   (334.4 ns .. 3.517 μs)

benchmarking keys database/getAssociatedKey from 1000 (miss)
time                 173.5 μs   (172.7 μs .. 174.2 μs)
                     1.000 R²   (0.999 R² .. 1.000 R²)
mean                 173.7 μs   (173.0 μs .. 175.5 μs)
std dev              3.833 μs   (1.858 μs .. 6.617 μs)
variance introduced by outliers: 16% (moderately inflated)

benchmarking keys database/getAssociatedFiles from 10000 (hit)
time                 64.01 μs   (63.84 μs .. 64.18 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 64.85 μs   (64.34 μs .. 66.02 μs)
std dev              2.433 μs   (547.6 ns .. 4.652 μs)
variance introduced by outliers: 40% (moderately inflated)

benchmarking keys database/getAssociatedFiles from 10000 (miss)
time                 50.33 μs   (50.28 μs .. 50.39 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 50.32 μs   (50.26 μs .. 50.38 μs)
std dev              202.7 ns   (167.6 ns .. 252.0 ns)

benchmarking keys database/getAssociatedKey from 10000 (hit)
time                 1.142 ms   (1.139 ms .. 1.146 ms)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 1.142 ms   (1.140 ms .. 1.144 ms)
std dev              7.142 μs   (4.994 μs .. 10.98 μs)

benchmarking keys database/getAssociatedKey from 10000 (miss)
time                 1.094 ms   (1.092 ms .. 1.096 ms)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 1.095 ms   (1.095 ms .. 1.097 ms)
std dev              4.277 μs   (2.591 μs .. 7.228 μs)

											
										
										
											2016-01-12 17:01:44 +00:00
+								containedTable :: TableName
 								containedTable = "content"
 								createTables :: SqlPersistM ()
 								createTables = void $ runMigrationSilent migrateKeysDb
-												split out raw sql interface

											
										
										
											2016-01-11 19:52:11 +00:00
+								newtype ReadHandle = ReadHandle H.DbQueue
 								readDb :: SqlPersistM a -> ReadHandle -> IO a
 								readDb a (ReadHandle h) = H.queryDbQueue h a
 								newtype WriteHandle = WriteHandle H.DbQueue
 								queueDb :: SqlPersistM () -> WriteHandle -> IO ()
 								queueDb a (WriteHandle h) = H.queueDb h checkcommit a
 								  where
 									-- commit queue after 1000 changes or 5 minutes, whichever comes first
 									checkcommit sz lastcommittime
 										| sz > 1000 = return True
 										| otherwise = do
 											now <- getCurrentTime
 											return $ diffUTCTime lastcommittime now > 300
 								addAssociatedFile :: SKey -> TopFilePath -> WriteHandle -> IO ()
 								addAssociatedFile sk f = queueDb $ do
 									-- If the same file was associated with a different key before,
 									-- remove that.
 									delete $ from $ \r -> do
 										where_ (r ^. AssociatedFile ==. val (getTopFilePath f) &&. not_ (r ^. AssociatedKey ==. val sk))
 									void $ insertUnique $ Associated sk (getTopFilePath f)
 								{- Note that the files returned were once associated with the key, but
 								 - some of them may not be any longer. -}
 								getAssociatedFiles :: SKey -> ReadHandle -> IO [TopFilePath]
 								getAssociatedFiles sk = readDb $ do
 									l <- select $ from $ \r -> do
 										where_ (r ^. AssociatedKey ==. val sk)
 										return (r ^. AssociatedFile)
 									return $ map (asTopFilePath . unValue) l
 								{- Gets any keys that are on record as having a particular associated file.
 								 - (Should be one or none but the database doesn't enforce that.) -}
 								getAssociatedKey :: TopFilePath -> ReadHandle -> IO [SKey]
 								getAssociatedKey f = readDb $ do
 									l <- select $ from $ \r -> do
 										where_ (r ^. AssociatedFile ==. val (getTopFilePath f))
 										return (r ^. AssociatedKey)
 									return $ map unValue l
 								removeAssociatedFile :: SKey -> TopFilePath -> WriteHandle -> IO ()
 								removeAssociatedFile sk f = queueDb $
 									delete $ from $ \r -> do
 										where_ (r ^. AssociatedKey ==. val sk &&. r ^. AssociatedFile ==. val (getTopFilePath f))
 								addInodeCaches :: SKey -> [InodeCache] -> WriteHandle -> IO ()
 								addInodeCaches sk is = queueDb $
 									forM_ is $ \i -> insertUnique $ Content sk (toSInodeCache i)
 								{- A key may have multiple InodeCaches; one for the annex object, and one
 								 - for each pointer file that is a copy of it. -}
 								getInodeCaches :: SKey -> ReadHandle -> IO [InodeCache]
 								getInodeCaches sk = readDb $ do
 									l <- select $ from $ \r -> do
 										where_ (r ^. ContentKey ==. val sk)
 										return (r ^. ContentCache)
 									return $ map (fromSInodeCache . unValue) l
 								removeInodeCaches :: SKey -> WriteHandle -> IO ()
 								removeInodeCaches sk = queueDb $
 									delete $ from $ \r -> do
 										where_ (r ^. ContentKey ==. val sk)