update for things already fixed in this branch

This commit is contained in:
Joey Hess 2019-10-30 13:50:16 -04:00
parent 9085a2cfec
commit 09c7cbbaa8
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 22 additions and 100 deletions

View file

@ -1,6 +1,10 @@
git-annex (7.20191025) UNRELEASED; urgency=medium git-annex (7.20191025) UNRELEASED; urgency=medium
* benchmark: Add --databases to benchmark sqlite databases. * benchmark: Add --databases to benchmark sqlite databases.
* Add some missing indexes to sqlite databases. This will speed up
some things involving export and import remotes, and git-annex smudge.
* Improved serialization of filenames and keys to the sqlite databases,
avoiding some encoding problems.
-- Joey Hess <id@joeyh.name> Tue, 29 Oct 2019 15:13:03 -0400 -- Joey Hess <id@joeyh.name> Tue, 29 Oct 2019 15:13:03 -0400

View file

@ -32,3 +32,7 @@ TransferScanner crashed: fd:47: commitBuffer: invalid argument (invalid characte
"""]] """]]
[[!meta title="v6 repository problems with filename encoding when not in unicode locale"]] [[!meta title="v6 repository problems with filename encoding when not in unicode locale"]]
> This bug grew to encompass several problems. I believe all of them are
> now [[fixed|done]]. Please reopen a new bug report if you see a problem like this
> with current git-annex. --[[Joey]]

View file

@ -10,12 +10,18 @@ process.
Eg, store the size and allowable mtimes in a separate table that is Eg, store the size and allowable mtimes in a separate table that is
queried when necessary. queried when necessary.
* Several selects were not able to use indexes, so would be slow.
Fixed by adding indexes.
* Database.Types has some suboptimal encodings for Key and InodeCache. * Database.Types has some suboptimal encodings for Key and InodeCache.
They are both slow due to being implemented using String They are both slow due to being implemented using String
(which may be fixable w/o changing the DB schema), (which may be fixable w/o changing the DB schema),
and the VARCHARs they generate are longer than necessary and the VARCHARs they generate are longer than necessary
since they look like eg `SKey "whatever"` and `I "whatever"` since they look like eg `SKey "whatever"` and `I "whatever"`
Fixed.
* SFilePath is stored efficiently, and has to be a String anyway, * SFilePath is stored efficiently, and has to be a String anyway,
(until ByteStringFilePath is used) (until ByteStringFilePath is used)
but since it's stored as a VARCHAR, which sqlite interprets using the but since it's stored as a VARCHAR, which sqlite interprets using the
@ -35,6 +41,8 @@ process.
And it seems likely that a query by filename would fail if the filename And it seems likely that a query by filename would fail if the filename
was in the database but with a different encoding. was in the database but with a different encoding.
Fixed by converting to blob.
* IKey could fail to round-trip as well, when a Key contains something * IKey could fail to round-trip as well, when a Key contains something
(eg, a filename extension) that is not valid in the current locale, (eg, a filename extension) that is not valid in the current locale,
for similar reasons to SFilePath. Using BLOB would be better. for similar reasons to SFilePath. Using BLOB would be better.
@ -49,6 +57,12 @@ process.
INSERT INTO associated VALUES(8,'SHA256E-s30--59594eea8d6f64156b3ce6530cc3a661739abf2a0b72443de8683c34b0b19344.ü','foo.ü'); INSERT INTO associated VALUES(8,'SHA256E-s30--59594eea8d6f64156b3ce6530cc3a661739abf2a0b72443de8683c34b0b19344.ü','foo.ü');
INSERT INTO associated VALUES(9,'SHA256E-s30--59594eea8d6f64156b3ce6530cc3a661739abf2a0b72443de8683c34b0b19344.<2E><>','"foo.\56515\56508"'); INSERT INTO associated VALUES(9,'SHA256E-s30--59594eea8d6f64156b3ce6530cc3a661739abf2a0b72443de8683c34b0b19344.<2E><>','"foo.\56515\56508"');
Fixed by converting to blob.
----
migration
> Investigated this in more detail, and I can't find a way to > Investigated this in more detail, and I can't find a way to
> solve the encoding problem other than changing the encoding > solve the encoding problem other than changing the encoding
> SKey, IKey, and SFilePath in a non-backwards-compatible way. > SKey, IKey, and SFilePath in a non-backwards-compatible way.
@ -102,103 +116,3 @@ process.
> And, what about users who use a mix of old and new git-annex versions? > And, what about users who use a mix of old and new git-annex versions?
> >
> Seems this needs an annex.version bump from v7 to v8. > Seems this needs an annex.version bump from v7 to v8.
----
[[!format patch """
diff --git a/Database/Types.hs b/Database/Types.hs
index f08cf4e9d..3e9c9e267 100644
--- a/Database/Types.hs
+++ b/Database/Types.hs
@@ -14,11 +14,12 @@ import Database.Persist.TH
import Database.Persist.Class hiding (Key)
import Database.Persist.Sql hiding (Key)
import Data.Maybe
-import Data.Char
import qualified Data.ByteString as S
+import qualified Data.ByteString.Lazy as L
import qualified Data.Text as T
import Utility.PartialPrelude
+import Utility.FileSystemEncoding
import Key
import Utility.InodeCache
import Git.Types (Ref(..))
@@ -37,23 +38,18 @@ fromSKey (SKey s) = fromMaybe (error $ "bad serialized Key " ++ s) (deserializeK
derivePersistField "SKey"
--- A Key index. More efficient than SKey, but its Read instance does not
--- work when it's used in any kind of complex data structure.
-newtype IKey = IKey String
-
-instance Read IKey where
- readsPrec _ s = [(IKey s, "")]
-
-instance Show IKey where
- show (IKey s) = s
+-- A Key index. More efficient than SKey.
+newtype IKey = IKey S.ByteString
+ deriving (Eq, Show, PersistField, PersistFieldSql)
+-- FIXME: toStrict copies, not efficient
toIKey :: Key -> IKey
-toIKey = IKey . serializeKey
+toIKey = IKey . L.toStrict . serializeKey'
fromIKey :: IKey -> Key
-fromIKey (IKey s) = fromMaybe (error $ "bad serialized Key " ++ s) (deserializeKey s)
-
-derivePersistField "IKey"
+fromIKey (IKey b) = fromMaybe
+ (error $ "bad serialized Key " ++ show b)
+ (deserializeKey' b)
-- A serialized InodeCache
newtype SInodeCache = I String
@@ -67,39 +63,15 @@ fromSInodeCache (I s) = fromMaybe (error $ "bad serialized InodeCache " ++ s) (r
derivePersistField "SInodeCache"
--- A serialized FilePath.
---
--- Not all unicode characters round-trip through sqlite. In particular,
--- surrigate code points do not. So, escape the FilePath. But, only when
--- it contains such characters.
-newtype SFilePath = SFilePath String
-
--- Note that Read instance does not work when used in any kind of complex
--- data structure.
-instance Read SFilePath where
- readsPrec _ s = [(SFilePath s, "")]
-
-instance Show SFilePath where
- show (SFilePath s) = s
+-- A serialized FilePath. Stored as a ByteString to avoid encoding problems.
+newtype SFilePath = SFilePath S.ByteString
+ deriving (Eq, Show, PersistField, PersistFieldSql)
toSFilePath :: FilePath -> SFilePath
-toSFilePath s@('"':_) = SFilePath (show s)
-toSFilePath s
- | any needsescape s = SFilePath (show s)
- | otherwise = SFilePath s
- where
- needsescape c = case generalCategory c of
- Surrogate -> True
- PrivateUse -> True
- NotAssigned -> True
- _ -> False
+toSFilePath = SFilePath . encodeBS
fromSFilePath :: SFilePath -> FilePath
-fromSFilePath (SFilePath s@('"':_)) =
- fromMaybe (error "bad serialized SFilePath " ++ s) (readish s)
-fromSFilePath (SFilePath s) = s
-
-derivePersistField "SFilePath"
+fromSFilePath (SFilePath b) = decodeBS b
-- A serialized Ref
newtype SRef = SRef Ref
"""]]