horrible implementation of isInodeKnown

The only good thing about it is it does not require a major version bump to improve the database. That will need to happen at some point though. Potentially very very slow in a large repository. Ugly use of raw sql.
2019-10-23 14:06:11 -04:00 · 2019-10-23 14:06:11 -04:00 · 94efc400e9
commit 94efc400e9
parent eebf080b33
4 changed files with 65 additions and 2 deletions
--- a/Database/Keys.hs
+++ b/Database/Keys.hs
@ -1,6 +1,6 @@
 {- Sqlite database of information about Keys
 -
- - Copyright 2015-2018 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2019 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
@ -19,6 +19,7 @@ module Database.Keys (
 	addInodeCaches,
 	getInodeCaches,
 	removeInodeCaches,
 	isInodeKnown,
 	runWriter,
 ) where
@ -187,6 +188,9 @@ getInodeCaches = runReaderIO . SQL.getInodeCaches . toIKey
 {- Removes the inode caches recorded for a key: the key is converted
 - with toIKey and handed to SQL.removeInodeCaches, run via runWriterIO. -}
 removeInodeCaches :: Key -> Annex ()
 removeInodeCaches k = runWriterIO (SQL.removeInodeCaches (toIKey k))
 {- Checks if the keys database knows about the given inode cache,
 - by running SQL.isInodeKnown through runReaderIO.
 -
 - The Bool is wrapped in a single-element list before being handed to
 - runReaderIO and collapsed again with `or` (presumably because
 - runReaderIO wants a monoidal result -- confirm against its type). -}
 isInodeKnown :: InodeCache -> SentinalStatus -> Annex Bool
 isInodeKnown i s = or <$> runReaderIO ((\known -> [known]) <$$> SQL.isInodeKnown i s)
 {- Looks at staged changes to find when unlocked files are copied/moved,
 - and updates associated files in the keys database.
 -
--- a/Database/Keys/SQL.hs
+++ b/Database/Keys/SQL.hs
@ -1,6 +1,6 @@
 {- Sqlite database of information about Keys
 -
- - Copyright 2015-2016 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2019 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
@ -23,6 +23,9 @@ import Database.Persist.Sql
 import Database.Persist.TH
 import Data.Time.Clock
 import Control.Monad
 import Data.Maybe
 import qualified Data.Text as T
 import qualified Data.Conduit.List as CL
 share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase|
 Associated
@ -116,3 +119,31 @@ getInodeCaches ik = readDb $ do
 {- Hands queueDb a delete of every Content row whose key is the
 - provided IKey. -}
 removeInodeCaches :: IKey -> WriteHandle -> IO ()
 removeInodeCaches ik = queueDb (deleteWhere matchingkey)
  where
 	matchingkey = [ContentKey ==. ik]
 {- Check if the inode is known to be used for an annexed file.
 -
 - When the sentinal status says inodes have not changed, this looks for
 - an exact match of the serialized inode cache. When inodes have
 - changed, it instead runs a raw SQL query that matches the cache column
 - against each of the LIKE patterns generated by likeInodeCacheWeak.
 -
 - This is currently slow due to the lack of indexes.
 -}
 isInodeKnown :: InodeCache -> SentinalStatus -> ReadHandle -> IO Bool
 isInodeKnown i s = readDb query
  where
 	query
 		| sentinalInodesChanged s =
 			withRawQuery likesql [] $ isJust <$> CL.head
 		| otherwise =
 			isJust <$> selectFirst [ContentCache ==. si] []
 	si = toSInodeCache i
 	-- Any one of the patterns matching is enough, so the LIKE
 	-- clauses have to be combined with OR. (Joining them with only
 	-- spaces does not produce a valid WHERE expression.)
 	likesql = T.concat
 		[ "SELECT key FROM content WHERE "
 		, T.intercalate " OR " (map mklike (likeInodeCacheWeak i))
 		, " LIMIT 1"
 		]
 	mklike p = T.concat
 		[ "cache LIKE "
 		, "'I \"" -- SInodeCache serializes as I "..."
 		, T.pack p
 		, "\"'"
 		]
--- a/Utility/InodeCache.hs
+++ b/Utility/InodeCache.hs
@ -23,6 +23,7 @@ module Utility.InodeCache (
 	showInodeCache,
 	genInodeCache,
 	toInodeCache,
 	likeInodeCacheWeak,
 	InodeCacheKey,
 	inodeCacheToKey,
@ -149,6 +150,22 @@ showInodeCache (InodeCache (InodeCachePrim inode size (MTimeLowRes mtime))) =
 		, show mtime
 		]
 -- Builds the list of SQL LIKE patterns that match a serialized inode
 -- cache which is weakly the same as the provided InodeCache.
 --
 -- As with compareWeak, the inode is ignored (matched by a leading
 -- wildcard), the size has to be the same, and the mtime can differ
 -- by anything less than 2 seconds.
 --
 -- The first three patterns end at the low resolution mtime; the last
 -- three are the same patterns followed by " %", so they also match
 -- when more data follows the mtime.
 likeInodeCacheWeak :: InodeCache -> [String]
 likeInodeCacheWeak (InodeCache (InodeCachePrim _ size mtime)) =
 	[ pat ++ suffix | suffix <- ["", " %"], pat <- basepats ]
  where
 	basepats = [ unwords ["%", show size, show secs] | secs <- nearby ]
 	nearby = [t, t+1, t-1]
 	t = lowResTime mtime
 readInodeCache :: String -> Maybe InodeCache
 readInodeCache s = case words s of
 	(inode:size:mtime:[]) -> do
--- a/doc/todo/sqlite_database_improvements.mdwn
+++ b/doc/todo/sqlite_database_improvements.mdwn
@ -2,6 +2,17 @@ Collection of non-ideal things about git-annex's use of sqlite databases.
 Would be good to improve these sometime, but it would need a migration
 process.
 * Database.Keys.SQL.isInodeKnown seems likely to get very slow
  when there are a lot of unlocked annexed files. It needs
  an index in the database, eg "InodeIndex cache".
  It also has to do some really ugly SQL LIKE queries. Probably an index
  would not speed them up. They're only needed when git-annex detects
  inodes are not stable, eg on fat or probably windows. A better database
  schema should be able to eliminate the need for those LIKE queries.
  Eg, store the size and allowable mtimes in a separate table that is
  queried when necessary.
 * Database.Export.getExportedKey would be faster if there was an index
  in the database, eg "ExportedIndex file key". This only affects
  the speed of `git annex export`, which is probably swamped by the actual