diff --git a/doc/todo/sqlite_database_improvements.mdwn b/doc/todo/sqlite_database_improvements.mdwn index d4c183dae0..0458c3f7c4 100644 --- a/doc/todo/sqlite_database_improvements.mdwn +++ b/doc/todo/sqlite_database_improvements.mdwn @@ -22,6 +22,171 @@ process. around with a hack that escapes FilePaths that contain unusual characters. It would be much better to use a BLOB. + Also, when LANG=C is sometimes used, the hack can result in duplicates with + different representations of the same filename, like this: + + INSERT INTO associated VALUES(4,'SHA256E-s30--7d51d2454391a40e952bea478e45d64cf0d606e1e8c0652bb815a22e0e23419a,'foo.ü'); + INSERT INTO associated VALUES(5,'SHA256E-s30--7d51d2454391a40e952bea478e45d64cf0d606e1e8c0652bb815a22e0e23419a','"foo.\56515\56508"'); + * IKey could fail to round-trip as well, when a Key contains something (eg, a filename extension) that is not valid in the current locale, for similar reasons to SFilePath. Using BLOB would be better. + + See [[!commit cf260d9a159050e2a7e70394fdd8db289c805ec3]] for how the + encoding problem was fixed for SFilePath. I reproduced a similar problem + by making a file `foo.ü` and running `git add` on it in a unicode + locale. Then with LANG=C, `git annex drop --force foo.ü` thinks + it drops the content, but in fact the work tree file is left containing + the dropped content. The database then contained: + + INSERT INTO associated VALUES(8,'SHA256E-s30--59594eea8d6f64156b3ce6530cc3a661739abf2a0b72443de8683c34b0b19344.ü','foo.ü'); + INSERT INTO associated VALUES(9,'SHA256E-s30--59594eea8d6f64156b3ce6530cc3a661739abf2a0b72443de8683c34b0b19344.��','"foo.\56515\56508"'); + +> Investigated this in more detail, and I can't find a way to +> solve the encoding problem other than changing the encoding +> SKey, IKey, and SFilePath in a non-backwards-compatible way. +> +> (Unless the encoding problem is related to persistent's use of Text +> internally, and could then perhaps be avoided by avoiding that?) +> +> The simplest and best final result would be use a ByteString +> for all of them, and store a blob in sqlite. Attached patch +> shows how to do that, but old git-annex won't be able to read +> the updated databases, and won't know that it can't read them! +> +> This seems to call for a flag day, throwing out the old database +> contents and regenerating them from other data: +> +> * Fsck (SKey) +> can't rebuild? Just drop and let incremental fscks re-do work +> * ContentIdentifier (IKey) +> rebuild with updateFromLog, would need to diff from empty tree to +> current git-annex branch, may be expensive to do! +> * Export (IKey, SFilePath) +> difficult to rebuild, what if in the middle of an interrupted +> export? +> +> updateExportTreeFromLog only updates two tables, not others +> +> Conceptually, this is the same as the repo being lost and another +> clone being used to update the export. The clone can only learn +> export state from the log. It's supposed to recover from such +> situations, the next time an export is run, so should be ok. +> But it might result in already exported files being re-uploaded, +> or other unncessary work. +> Keys (IKey, SFilePath) +> rebuild with scanUnlockedFiles +> +> does that update the Content table with the InodeCache? +> +> But after such a transition, how to communicate to the old git-annex +> that it can't use the databases any longer? Moving the databases +> out of the way won't do; old git-annex will just recreate them and +> start with missing data! +> +> And, what about users who really need to continue using an old git-annex +> and get bitten by the flag day? +> +> Should this instead be a annex.version bump from v7 to v8? +> But v5 is also affected for ContentIdentifier and Export and Fsck. +> Don't want v5.1. + +---- + +[[!format patch """ +diff --git a/Database/Types.hs b/Database/Types.hs +index f08cf4e9d..3e9c9e267 100644 +--- a/Database/Types.hs ++++ b/Database/Types.hs +@@ -14,11 +14,12 @@ import Database.Persist.TH + import Database.Persist.Class hiding (Key) + import Database.Persist.Sql hiding (Key) + import Data.Maybe +-import Data.Char + import qualified Data.ByteString as S ++import qualified Data.ByteString.Lazy as L + import qualified Data.Text as T + + import Utility.PartialPrelude ++import Utility.FileSystemEncoding + import Key + import Utility.InodeCache + import Git.Types (Ref(..)) +@@ -37,23 +38,18 @@ fromSKey (SKey s) = fromMaybe (error $ "bad serialized Key " ++ s) (deserializeK + + derivePersistField "SKey" + +--- A Key index. More efficient than SKey, but its Read instance does not +--- work when it's used in any kind of complex data structure. +-newtype IKey = IKey String +- +-instance Read IKey where +- readsPrec _ s = [(IKey s, "")] +- +-instance Show IKey where +- show (IKey s) = s ++-- A Key index. More efficient than SKey. ++newtype IKey = IKey S.ByteString ++ deriving (Eq, Show, PersistField, PersistFieldSql) + ++-- FIXME: toStrict copies, not efficient + toIKey :: Key -> IKey +-toIKey = IKey . serializeKey ++toIKey = IKey . L.toStrict . serializeKey' + + fromIKey :: IKey -> Key +-fromIKey (IKey s) = fromMaybe (error $ "bad serialized Key " ++ s) (deserializeKey s) +- +-derivePersistField "IKey" ++fromIKey (IKey b) = fromMaybe ++ (error $ "bad serialized Key " ++ show b) ++ (deserializeKey' b) + + -- A serialized InodeCache + newtype SInodeCache = I String +@@ -67,39 +63,15 @@ fromSInodeCache (I s) = fromMaybe (error $ "bad serialized InodeCache " ++ s) (r + + derivePersistField "SInodeCache" + +--- A serialized FilePath. +--- +--- Not all unicode characters round-trip through sqlite. In particular, +--- surrigate code points do not. So, escape the FilePath. But, only when +--- it contains such characters. +-newtype SFilePath = SFilePath String +- +--- Note that Read instance does not work when used in any kind of complex +--- data structure. +-instance Read SFilePath where +- readsPrec _ s = [(SFilePath s, "")] +- +-instance Show SFilePath where +- show (SFilePath s) = s ++-- A serialized FilePath. Stored as a ByteString to avoid encoding problems. ++newtype SFilePath = SFilePath S.ByteString ++ deriving (Eq, Show, PersistField, PersistFieldSql) + + toSFilePath :: FilePath -> SFilePath +-toSFilePath s@('"':_) = SFilePath (show s) +-toSFilePath s +- | any needsescape s = SFilePath (show s) +- | otherwise = SFilePath s +- where +- needsescape c = case generalCategory c of +- Surrogate -> True +- PrivateUse -> True +- NotAssigned -> True +- _ -> False ++toSFilePath = SFilePath . encodeBS + + fromSFilePath :: SFilePath -> FilePath +-fromSFilePath (SFilePath s@('"':_)) = +- fromMaybe (error "bad serialized SFilePath " ++ s) (readish s) +-fromSFilePath (SFilePath s) = s +- +-derivePersistField "SFilePath" ++fromSFilePath (SFilePath b) = decodeBS b + + -- A serialized Ref + newtype SRef = SRef Ref +"""]]