From 09c7cbbaa8714626813c5aaa38927c541d0d6453 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 30 Oct 2019 13:50:16 -0400 Subject: [PATCH] update for things already fixed in this branch --- CHANGELOG | 4 + .../assistant_crashes_in_TransferScanner.mdwn | 4 + doc/todo/sqlite_database_improvements.mdwn | 114 +++--------------- 3 files changed, 22 insertions(+), 100 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 22c69752e1..630cf05016 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,10 @@ git-annex (7.20191025) UNRELEASED; urgency=medium * benchmark: Add --databases to benchmark sqlite databases. + * Add some missing indexes to sqlite databases. This will speed up + some things involving export and import remotes, and git-annex smudge. + * Improved serialization of filenames and keys to the sqlite databases, + avoiding some encoding problems. -- Joey Hess Tue, 29 Oct 2019 15:13:03 -0400 diff --git a/doc/bugs/assistant_crashes_in_TransferScanner.mdwn b/doc/bugs/assistant_crashes_in_TransferScanner.mdwn index 53147bcef9..175a87c398 100644 --- a/doc/bugs/assistant_crashes_in_TransferScanner.mdwn +++ b/doc/bugs/assistant_crashes_in_TransferScanner.mdwn @@ -32,3 +32,7 @@ TransferScanner crashed: fd:47: commitBuffer: invalid argument (invalid characte """]] [[!meta title="v6 repository problems with filename encoding when not in unicode locale"]] + +> This bug grew to encompass several problems. I believe all of them are +> now [[fixed|done]]. Please reopen a new bug report if you see a problem like this +> with current git-annex. --[[Joey]] diff --git a/doc/todo/sqlite_database_improvements.mdwn b/doc/todo/sqlite_database_improvements.mdwn index eaf1bb9836..8f1a5208b0 100644 --- a/doc/todo/sqlite_database_improvements.mdwn +++ b/doc/todo/sqlite_database_improvements.mdwn @@ -10,12 +10,18 @@ process. Eg, store the size and allowable mtimes in a separate table that is queried when necessary. +* Several selects were not able to use indexes, so would be slow. + + Fixed by adding indexes. + * Database.Types has some suboptimal encodings for Key and InodeCache. They are both slow due to being implemented using String (which may be fixable w/o changing the DB schema), and the VARCHARs they generate are longer than necessary since they look like eg `SKey "whatever"` and `I "whatever"` + Fixed. + * SFilePath is stored efficiently, and has to be a String anyway, (until ByteStringFilePath is used) but since it's stored as a VARCHAR, which sqlite interprets using the @@ -35,6 +41,8 @@ process. And it seems likely that a query by filename would fail if the filename was in the database but with a different encoding. + Fixed by converting to blob. + * IKey could fail to round-trip as well, when a Key contains something (eg, a filename extension) that is not valid in the current locale, for similar reasons to SFilePath. Using BLOB would be better. @@ -49,6 +57,12 @@ process. INSERT INTO associated VALUES(8,'SHA256E-s30--59594eea8d6f64156b3ce6530cc3a661739abf2a0b72443de8683c34b0b19344.ü','foo.ü'); INSERT INTO associated VALUES(9,'SHA256E-s30--59594eea8d6f64156b3ce6530cc3a661739abf2a0b72443de8683c34b0b19344.��','"foo.\56515\56508"'); + Fixed by converting to blob. + +---- + +migration + > Investigated this in more detail, and I can't find a way to > solve the encoding problem other than changing the encoding > SKey, IKey, and SFilePath in a non-backwards-compatible way. @@ -102,103 +116,3 @@ process. > And, what about users who use a mix of old and new git-annex versions? > > Seems this needs an annex.version bump from v7 to v8. - ----- - -[[!format patch """ -diff --git a/Database/Types.hs b/Database/Types.hs -index f08cf4e9d..3e9c9e267 100644 ---- a/Database/Types.hs -+++ b/Database/Types.hs -@@ -14,11 +14,12 @@ import Database.Persist.TH - import Database.Persist.Class hiding (Key) - import Database.Persist.Sql hiding (Key) - import Data.Maybe --import Data.Char - import qualified Data.ByteString as S -+import qualified Data.ByteString.Lazy as L - import qualified Data.Text as T - - import Utility.PartialPrelude -+import Utility.FileSystemEncoding - import Key - import Utility.InodeCache - import Git.Types (Ref(..)) -@@ -37,23 +38,18 @@ fromSKey (SKey s) = fromMaybe (error $ "bad serialized Key " ++ s) (deserializeK - - derivePersistField "SKey" - ---- A Key index. More efficient than SKey, but its Read instance does not ---- work when it's used in any kind of complex data structure. --newtype IKey = IKey String -- --instance Read IKey where -- readsPrec _ s = [(IKey s, "")] -- --instance Show IKey where -- show (IKey s) = s -+-- A Key index. More efficient than SKey. -+newtype IKey = IKey S.ByteString -+ deriving (Eq, Show, PersistField, PersistFieldSql) - -+-- FIXME: toStrict copies, not efficient - toIKey :: Key -> IKey --toIKey = IKey . serializeKey -+toIKey = IKey . L.toStrict . serializeKey' - - fromIKey :: IKey -> Key --fromIKey (IKey s) = fromMaybe (error $ "bad serialized Key " ++ s) (deserializeKey s) -- --derivePersistField "IKey" -+fromIKey (IKey b) = fromMaybe -+ (error $ "bad serialized Key " ++ show b) -+ (deserializeKey' b) - - -- A serialized InodeCache - newtype SInodeCache = I String -@@ -67,39 +63,15 @@ fromSInodeCache (I s) = fromMaybe (error $ "bad serialized InodeCache " ++ s) (r - - derivePersistField "SInodeCache" - ---- A serialized FilePath. ---- ---- Not all unicode characters round-trip through sqlite. In particular, ---- surrigate code points do not. So, escape the FilePath. But, only when ---- it contains such characters. --newtype SFilePath = SFilePath String -- ---- Note that Read instance does not work when used in any kind of complex ---- data structure. --instance Read SFilePath where -- readsPrec _ s = [(SFilePath s, "")] -- --instance Show SFilePath where -- show (SFilePath s) = s -+-- A serialized FilePath. Stored as a ByteString to avoid encoding problems. -+newtype SFilePath = SFilePath S.ByteString -+ deriving (Eq, Show, PersistField, PersistFieldSql) - - toSFilePath :: FilePath -> SFilePath --toSFilePath s@('"':_) = SFilePath (show s) --toSFilePath s -- | any needsescape s = SFilePath (show s) -- | otherwise = SFilePath s -- where -- needsescape c = case generalCategory c of -- Surrogate -> True -- PrivateUse -> True -- NotAssigned -> True -- _ -> False -+toSFilePath = SFilePath . encodeBS - - fromSFilePath :: SFilePath -> FilePath --fromSFilePath (SFilePath s@('"':_)) = -- fromMaybe (error "bad serialized SFilePath " ++ s) (readish s) --fromSFilePath (SFilePath s) = s -- --derivePersistField "SFilePath" -+fromSFilePath (SFilePath b) = decodeBS b - - -- A serialized Ref - newtype SRef = SRef Ref -"""]]