horrible implementation of isInodeKnown

The only good thing about it is it does not require a major version bump to improve the database. That will need to happen at some point though. Potentially very very slow in a large repository. Ugly use of raw sql.
2019-10-23 14:06:11 -04:00 · 2019-10-23 14:06:11 -04:00 · 94efc400e9
commit 94efc400e9
parent eebf080b33
4 changed files with 65 additions and 2 deletions
--- a/Database/Keys.hs
+++ b/Database/Keys.hs
@ -1,6 +1,6 @@
 {- Sqlite database of information about Keys
 -
- - Copyright 2015-2018 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2019 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
@ -19,6 +19,7 @@ module Database.Keys (
 	addInodeCaches,
 	getInodeCaches,
 	removeInodeCaches,
+	isInodeKnown,
 	runWriter,
 ) where

@ -187,6 +188,9 @@ getInodeCaches = runReaderIO . SQL.getInodeCaches . toIKey
 {- Converts the Key with toIKey and hands it to SQL.removeInodeCaches,
  - running that action through runWriterIO. -}
 removeInodeCaches :: Key -> Annex ()
 removeInodeCaches = runWriterIO . SQL.removeInodeCaches . toIKey

+{- Checks if the keys database knows of the InodeCache, by running
+ - SQL.isInodeKnown through runReaderIO.
+ -
+ - The Bool is wrapped in a singleton list on the way through
+ - runReaderIO, and collapsed back to a Bool with or afterwards.
+ -}
+isInodeKnown :: InodeCache -> SentinalStatus -> Annex Bool
+isInodeKnown i s = or <$> runReaderIO check
+  where
+	check = (\known -> [known]) <$$> SQL.isInodeKnown i s
+
 {- Looks at staged changes to find when unlocked files are copied/moved,
 - and updates associated files in the keys database.
 -
--- a/Database/Keys/SQL.hs
+++ b/Database/Keys/SQL.hs
@ -1,6 +1,6 @@
 {- Sqlite database of information about Keys
 -
- - Copyright 2015-2016 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2019 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
@ -23,6 +23,9 @@ import Database.Persist.Sql
 import Database.Persist.TH
 import Data.Time.Clock
 import Control.Monad
+import Data.Maybe
+import qualified Data.Text as T
+import qualified Data.Conduit.List as CL

 share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase|
 Associated
@ -116,3 +119,31 @@ getInodeCaches ik = readDb $ do
 {- Passes to queueDb a deleteWhere that matches every row whose
  - ContentKey equals the given IKey. -}
 removeInodeCaches :: IKey -> WriteHandle -> IO ()
 removeInodeCaches ik = queueDb $
 	deleteWhere [ContentKey ==. ik]
+
+{- Check if the inode is known to be used for an annexed file.
+ -
+ - When the sentinal status says inodes have changed, a raw SQL query
+ - looks for any cache matching one of the patterns generated by
+ - likeInodeCacheWeak. Otherwise, an exact match on the serialized
+ - inode cache is looked up.
+ -
+ - This is currently slow due to the lack of indexes.
+ -}
+isInodeKnown :: InodeCache -> SentinalStatus -> ReadHandle -> IO Bool
+isInodeKnown i s = readDb query
+  where
+	query
+		| sentinalInodesChanged s =
+			withRawQuery likesql [] $ isJust <$> CL.head
+		| otherwise =
+			isJust <$> selectFirst [ContentCache ==. si] []
+
+	si = toSInodeCache i
+
+	-- The patterns are alternatives, so the LIKE clauses have to be
+	-- joined with OR; merely juxtaposing them is not valid SQL.
+	likesql = T.concat
+		[ "SELECT key FROM content WHERE "
+		, T.intercalate " OR " (map mklike (likeInodeCacheWeak i))
+		, " LIMIT 1"
+		]
+
+	mklike p = T.concat
+		[ "cache LIKE "
+		, "'I \"" -- SInodeCache serializes as I "..."
+		, T.pack p
+		, "\"'"
+		]
--- a/Utility/InodeCache.hs
+++ b/Utility/InodeCache.hs
@ -23,6 +23,7 @@ module Utility.InodeCache (
 	showInodeCache,
 	genInodeCache,
 	toInodeCache,
+	likeInodeCacheWeak,

 	InodeCacheKey,
 	inodeCacheToKey,
@ -149,6 +150,22 @@ showInodeCache (InodeCache (InodeCachePrim inode size (MTimeLowRes mtime))) =
 		, show mtime
 		]

+-- Builds the list of SQL LIKE patterns that match a serialized inode
+-- cache which is weakly the same as the provided InodeCache.
+--
+-- The inode is ignored (matched by a leading wildcard). Like compareWeak,
+-- the size has to match, while the mtime can differ by anything less
+-- than 2 seconds, so patterns are generated for the low resolution
+-- time, and for one second either side of it.
+--
+-- Each pattern comes in two forms: one ending at the mtime, and one
+-- followed by " %" to match a serialization that has more fields after
+-- the mtime.
+likeInodeCacheWeak :: InodeCache -> [String]
+likeInodeCacheWeak (InodeCache (InodeCachePrim _ size mtime)) =
+	exact ++ [ pat ++ " %" | pat <- exact ]
+  where
+	exact = [ prefix ++ show secs | secs <- [base, base+1, base-1] ]
+	prefix = "% " ++ show size ++ " "
+	base = lowResTime mtime
+
 readInodeCache :: String -> Maybe InodeCache
 readInodeCache s = case words s of
 	(inode:size:mtime:[]) -> do
--- a/doc/todo/sqlite_database_improvements.mdwn
+++ b/doc/todo/sqlite_database_improvements.mdwn
@ -2,6 +2,17 @@ Collection of non-ideal things about git-annex's use of sqlite databases.
 Would be good to improve these sometime, but it would need a migration
 process.

+* Database.Keys.SQL.isInodeKnown seems likely to get very slow
+  when there are a lot of unlocked annexed files. It needs
+  an index in the database, eg "InodeIndex cache"
+
+  It also has to do some really ugly SQL LIKE queries. Probably an index
+  would not speed them up. They're only needed when git-annex detects
+  inodes are not stable, eg on fat or probably windows. A better database
+  schema should be able to eliminate the need for those LIKE queries.
+  Eg, store the size and allowable mtimes in a separate table that is
+  queried when necessary.
+
 * Database.Export.getExportedKey would be faster if there was an index
  in the database, eg "ExportedIndex file key". This only affects
  the speed of `git annex export`, which is probably swamped by the actual