switch MetaValue to ByteString and MetaField to Text
MetaField was already limited to alphanumerics, so it makes sense to use Text for it. Note that technically a UUID can contain invalid UTF-8, and so remoteMetaDataPrefix's use of T.pack . fromUUID could replace non-UTF8 values with '?' or whatever. In practice, a UUID is usually also text, I only kept open the possibility of it containing invalid UTF-8 to avoid breaking parsing of strange UUIDs in git-annex branch files. So, I decided to let this edge case slip by. Have not updated the rest of the code base yet for this change, as the change took 2.5 hours longer than I expected to get working properly.
This commit is contained in:
parent
a80922a594
commit
16c798b5ef
3 changed files with 88 additions and 60 deletions
|
@ -1,11 +1,12 @@
|
|||
{- git-annex general metadata
|
||||
-
|
||||
- Copyright 2014-2018 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2014-2019 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU GPL version 3 or higher.
|
||||
-}
|
||||
|
||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Types.MetaData (
|
||||
MetaData(..),
|
||||
|
@ -51,11 +52,14 @@ import Utility.Aeson
|
|||
import Types.UUID
|
||||
|
||||
import qualified Data.Text as T
|
||||
import qualified Data.Text.Encoding as E
|
||||
import qualified Data.Set as S
|
||||
import qualified Data.Map.Strict as M
|
||||
import qualified Data.HashMap.Strict as HM
|
||||
import Data.Char
|
||||
import qualified Data.CaseInsensitive as CI
|
||||
import qualified Data.ByteString as B
|
||||
import qualified Data.ByteString.Char8 as B8
|
||||
|
||||
newtype MetaData = MetaData (M.Map MetaField (S.Set MetaValue))
|
||||
deriving (Show, Eq, Ord)
|
||||
|
@ -63,14 +67,14 @@ newtype MetaData = MetaData (M.Map MetaField (S.Set MetaValue))
|
|||
instance ToJSON' MetaData where
|
||||
toJSON' (MetaData m) = object $ map go (M.toList m)
|
||||
where
|
||||
go (MetaField f, s) = (packString (CI.original f), toJSON' s)
|
||||
go (MetaField f, s) = (CI.original f, toJSON' s)
|
||||
|
||||
instance FromJSON MetaData where
|
||||
parseJSON (Object o) = do
|
||||
l <- HM.toList <$> parseJSON (Object o)
|
||||
MetaData . M.fromList <$> mapM go l
|
||||
where
|
||||
go (t, l) = case mkMetaField (T.unpack t) of
|
||||
go (t, l) = case mkMetaField t of
|
||||
Left e -> fail e
|
||||
Right f -> (,) <$> pure f <*> parseJSON l
|
||||
parseJSON _ = fail "expected an object"
|
||||
|
@ -81,17 +85,18 @@ newtype CurrentlySet = CurrentlySet Bool
|
|||
deriving (Read, Show, Eq, Ord, Arbitrary)
|
||||
|
||||
{- Fields are case insensitive. -}
|
||||
newtype MetaField = MetaField (CI.CI String)
|
||||
newtype MetaField = MetaField (CI.CI T.Text)
|
||||
deriving (Read, Show, Eq, Ord)
|
||||
|
||||
data MetaValue = MetaValue CurrentlySet String
|
||||
data MetaValue = MetaValue CurrentlySet B.ByteString
|
||||
deriving (Read, Show)
|
||||
|
||||
instance ToJSON' MetaValue where
|
||||
toJSON' (MetaValue _ v) = toJSON' v
|
||||
|
||||
instance FromJSON MetaValue where
|
||||
parseJSON (String v) = return $ MetaValue (CurrentlySet True) (T.unpack v)
|
||||
parseJSON (String v) = return $
|
||||
MetaValue (CurrentlySet True) (E.encodeUtf8 v)
|
||||
parseJSON _ = fail "expected a string"
|
||||
|
||||
{- Metadata values compare and order the same whether currently set or not. -}
|
||||
|
@ -105,14 +110,16 @@ instance Ord MetaValue where
|
|||
- field1 +val1 +val2 -val3 field2 +val4 +val5
|
||||
-}
|
||||
class MetaSerializable v where
|
||||
serialize :: v -> String
|
||||
deserialize :: String -> Maybe v
|
||||
serialize :: v -> B.ByteString
|
||||
deserialize :: B.ByteString -> Maybe v
|
||||
|
||||
instance MetaSerializable MetaData where
|
||||
serialize (MetaData m) = unwords $ concatMap go $ M.toList m
|
||||
serialize (MetaData m) = B8.unwords $ concatMap go $ M.toList m
|
||||
where
|
||||
go (f, vs) = serialize f : map serialize (S.toList vs)
|
||||
deserialize = Just . getfield emptyMetaData . words
|
||||
-- Note that B8.words cannot be used here, because UTF-8 encoded
|
||||
-- field names may contain bytes such as \160 that are whitespace.
|
||||
deserialize = Just . getfield emptyMetaData . B8.split ' '
|
||||
where
|
||||
getfield m [] = m
|
||||
getfield m (w:ws) = maybe m (getvalues m ws) (deserialize w)
|
||||
|
@ -122,23 +129,25 @@ instance MetaSerializable MetaData where
|
|||
Nothing -> getfield m l
|
||||
|
||||
instance MetaSerializable MetaField where
|
||||
serialize (MetaField f) = CI.original f
|
||||
deserialize = Just . mkMetaFieldUnchecked
|
||||
serialize (MetaField f) = E.encodeUtf8 (CI.original f)
|
||||
deserialize = MetaField . CI.mk <$$> eitherToMaybe . E.decodeUtf8'
|
||||
|
||||
{- Base64 problematic values. -}
|
||||
instance MetaSerializable MetaValue where
|
||||
serialize (MetaValue isset v) =
|
||||
serialize isset ++
|
||||
if any isSpace v || "!" `isPrefixOf` v
|
||||
then '!' : toB64 v
|
||||
serialize isset <>
|
||||
if B8.any (== ' ') v || "!" `B8.isPrefixOf` v
|
||||
then "!" <> toB64' v
|
||||
else v
|
||||
deserialize (isset:'!':v) = MetaValue
|
||||
<$> deserialize [isset]
|
||||
<*> fromB64Maybe v
|
||||
deserialize (isset:v) = MetaValue
|
||||
<$> deserialize [isset]
|
||||
<*> pure v
|
||||
deserialize [] = Nothing
|
||||
deserialize b = do
|
||||
(isset, b') <- B8.uncons b
|
||||
case B8.uncons b' of
|
||||
Just ('!', b'') -> MetaValue
|
||||
<$> deserialize (B8.singleton isset)
|
||||
<*> fromB64Maybe' b''
|
||||
_ -> MetaValue
|
||||
<$> deserialize (B8.singleton isset)
|
||||
<*> pure b'
|
||||
|
||||
instance MetaSerializable CurrentlySet where
|
||||
serialize (CurrentlySet True) = "+"
|
||||
|
@ -147,17 +156,17 @@ instance MetaSerializable CurrentlySet where
|
|||
deserialize "-" = Just (CurrentlySet False)
|
||||
deserialize _ = Nothing
|
||||
|
||||
mkMetaField :: String -> Either String MetaField
|
||||
mkMetaField :: T.Text -> Either String MetaField
|
||||
mkMetaField f = maybe (Left $ badField f) Right (toMetaField f)
|
||||
|
||||
badField :: String -> String
|
||||
badField f = "Illegal metadata field name, \"" ++ f ++ "\""
|
||||
badField :: T.Text -> String
|
||||
badField f = "Illegal metadata field name, \"" ++ T.unpack f ++ "\""
|
||||
|
||||
{- Does not check that the field name is valid. Use with caution. -}
|
||||
mkMetaFieldUnchecked :: String -> MetaField
|
||||
mkMetaFieldUnchecked :: T.Text -> MetaField
|
||||
mkMetaFieldUnchecked = MetaField . CI.mk
|
||||
|
||||
toMetaField :: String -> Maybe MetaField
|
||||
toMetaField :: T.Text -> Maybe MetaField
|
||||
toMetaField f
|
||||
| legalField f = Just $ MetaField $ CI.mk f
|
||||
| otherwise = Nothing
|
||||
|
@ -168,23 +177,29 @@ toMetaField f
|
|||
- Additionally, fields should not contain any form of path separator, as
|
||||
- that would break views.
|
||||
-
|
||||
- And, fields need to be valid JSON keys.
|
||||
-
|
||||
- So, require they have an alphanumeric first letter, with the remainder
|
||||
- being either alphanumeric or a small set of whitelisted common punctuation.
|
||||
-}
|
||||
legalField :: String -> Bool
|
||||
legalField [] = False
|
||||
legalField (c1:cs)
|
||||
| not (isAlphaNum c1) = False
|
||||
| otherwise = all legalchars cs
|
||||
legalField :: T.Text -> Bool
|
||||
legalField t = case T.uncons t of
|
||||
Nothing -> False
|
||||
Just (c1, t')
|
||||
| not (isAlphaNum c1) -> False
|
||||
| otherwise -> T.all legalchars t'
|
||||
where
|
||||
legalchars c
|
||||
| isAlphaNum c = True
|
||||
| otherwise = c `elem` "_-."
|
||||
| otherwise = c `elem` legalFieldWhiteList
|
||||
|
||||
toMetaValue :: String -> MetaValue
|
||||
legalFieldWhiteList :: [Char]
|
||||
legalFieldWhiteList = "_-."
|
||||
|
||||
toMetaValue :: B.ByteString -> MetaValue
|
||||
toMetaValue = MetaValue (CurrentlySet True)
|
||||
|
||||
mkMetaValue :: CurrentlySet -> String -> MetaValue
|
||||
mkMetaValue :: CurrentlySet -> B.ByteString -> MetaValue
|
||||
mkMetaValue = MetaValue
|
||||
|
||||
unsetMetaValue :: MetaValue -> MetaValue
|
||||
|
@ -194,10 +209,10 @@ unsetMetaValue (MetaValue _ s) = MetaValue (CurrentlySet False) s
|
|||
unsetMetaData :: MetaData -> MetaData
|
||||
unsetMetaData (MetaData m) = MetaData $ M.map (S.map unsetMetaValue) m
|
||||
|
||||
fromMetaField :: MetaField -> String
|
||||
fromMetaField :: MetaField -> T.Text
|
||||
fromMetaField (MetaField f) = CI.original f
|
||||
|
||||
fromMetaValue :: MetaValue -> String
|
||||
fromMetaValue :: MetaValue -> B.ByteString
|
||||
fromMetaValue (MetaValue _ f) = f
|
||||
|
||||
fromMetaData :: MetaData -> [(MetaField, S.Set MetaValue)]
|
||||
|
@ -296,26 +311,26 @@ extractRemoteMetaData :: UUID -> MetaData -> RemoteMetaData
|
|||
extractRemoteMetaData u (MetaData m) = RemoteMetaData u $ MetaData $
|
||||
M.mapKeys removeprefix $ M.filterWithKey belongsremote m
|
||||
where
|
||||
belongsremote (MetaField f) _v = prefix `isPrefixOf` CI.original f
|
||||
belongsremote (MetaField f) _v = prefix `T.isPrefixOf` CI.original f
|
||||
removeprefix (MetaField f) = MetaField $
|
||||
CI.mk $ drop prefixlen $ CI.original f
|
||||
CI.mk $ T.drop prefixlen $ CI.original f
|
||||
prefix = remoteMetaDataPrefix u
|
||||
prefixlen = length prefix
|
||||
prefixlen = T.length prefix
|
||||
|
||||
splitRemoteMetaDataField :: MetaField -> Maybe (UUID, MetaField)
|
||||
splitRemoteMetaDataField (MetaField f) = do
|
||||
let (su, sf) = separate (== ':') (CI.original f)
|
||||
f' <- toMetaField sf
|
||||
return $ (toUUID su, f')
|
||||
let (su, sf) = T.break (== ':') (CI.original f)
|
||||
f' <- toMetaField ((T.drop 1 sf))
|
||||
return $ (toUUID (T.unpack su), f')
|
||||
|
||||
remoteMetaDataPrefix :: UUID -> String
|
||||
remoteMetaDataPrefix u = fromUUID u ++ ":"
|
||||
remoteMetaDataPrefix :: UUID -> T.Text
|
||||
remoteMetaDataPrefix u = T.pack (fromUUID u) <> ":"
|
||||
|
||||
fromRemoteMetaData :: RemoteMetaData -> MetaData
|
||||
fromRemoteMetaData (RemoteMetaData u (MetaData m)) = MetaData $
|
||||
M.mapKeys addprefix m
|
||||
where
|
||||
addprefix (MetaField f) = MetaField $ CI.mk $ (prefix ++) $ CI.original f
|
||||
addprefix (MetaField f) = MetaField $ CI.mk $ prefix <> CI.original f
|
||||
prefix = remoteMetaDataPrefix u
|
||||
|
||||
{- Avoid putting too many fields in the map; extremely large maps make
|
||||
|
@ -334,11 +349,11 @@ instance Arbitrary MetaValue where
|
|||
-- Avoid non-ascii metavalues because fully arbitrary
|
||||
-- strings may not be encoded using the filesystem
|
||||
-- encoding, which is norally applied to all input.
|
||||
<*> arbitrary `suchThat` all isAscii
|
||||
<*> (encodeBS <$> arbitrary `suchThat` all isAscii)
|
||||
|
||||
instance Arbitrary MetaField where
|
||||
arbitrary = MetaField . CI.mk
|
||||
<$> arbitrary `suchThat` legalField
|
||||
<$> (T.pack <$> arbitrary) `suchThat` legalField
|
||||
|
||||
prop_metadata_sane :: MetaData -> MetaField -> MetaValue -> Bool
|
||||
prop_metadata_sane m f v = and
|
||||
|
|
|
@ -44,6 +44,9 @@ encode = Data.Aeson.encode . toJSON'
|
|||
class ToJSON' a where
|
||||
toJSON' :: a -> Value
|
||||
|
||||
instance ToJSON' T.Text where
|
||||
toJSON' = toJSON
|
||||
|
||||
instance ToJSON' String where
|
||||
toJSON' = toJSON . packString
|
||||
|
||||
|
|
|
@ -1,39 +1,49 @@
|
|||
{- Simple Base64 encoding of Strings
|
||||
{- Simple Base64 encoding
|
||||
-
|
||||
- Note that this uses the FileSystemEncoding, so it can be used on Strings
|
||||
- that repesent filepaths containing arbitrarily encoded characters.
|
||||
-
|
||||
- Copyright 2011, 2015 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- License: BSD-2-clause
|
||||
-}
|
||||
|
||||
module Utility.Base64 (toB64, fromB64Maybe, fromB64, prop_b64_roundtrips) where
|
||||
module Utility.Base64 where
|
||||
|
||||
import Utility.FileSystemEncoding
|
||||
|
||||
import qualified "sandi" Codec.Binary.Base64 as B64
|
||||
import Data.Maybe
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
import qualified Data.ByteString as B
|
||||
import Data.ByteString.UTF8 (fromString, toString)
|
||||
import Data.Char
|
||||
|
||||
-- | This uses the FileSystemEncoding, so it can be used on Strings
|
||||
-- that repesent filepaths containing arbitrarily encoded characters.
|
||||
toB64 :: String -> String
|
||||
toB64 = toString . B64.encode . encodeBS
|
||||
|
||||
toB64' :: B.ByteString -> B.ByteString
|
||||
toB64' = B64.encode
|
||||
|
||||
fromB64Maybe :: String -> Maybe String
|
||||
fromB64Maybe s = either (const Nothing) (Just . decodeBL . L.fromStrict)
|
||||
fromB64Maybe s = either (const Nothing) (Just . decodeBS)
|
||||
(B64.decode $ fromString s)
|
||||
|
||||
fromB64Maybe' :: B.ByteString -> Maybe (B.ByteString)
|
||||
fromB64Maybe' = either (const Nothing) Just . B64.decode
|
||||
|
||||
fromB64 :: String -> String
|
||||
fromB64 = fromMaybe bad . fromB64Maybe
|
||||
where
|
||||
bad = error "bad base64 encoded data"
|
||||
|
||||
fromB64' :: B.ByteString -> B.ByteString
|
||||
fromB64' = fromMaybe bad . fromB64Maybe'
|
||||
where
|
||||
bad = error "bad base64 encoded data"
|
||||
|
||||
-- Only ascii strings are tested, because an arbitrary string may contain
|
||||
-- characters not encoded using the FileSystemEncoding, which would thus
|
||||
-- not roundtrip, as fromB64 always generates an output encoded that way.
|
||||
-- not roundtrip, as decodeBS always generates an output encoded that way.
|
||||
prop_b64_roundtrips :: String -> Bool
|
||||
prop_b64_roundtrips s
|
||||
| all (isAscii) s = s == fromB64 (toB64 s)
|
||||
| all (isAscii) s = s == decodeBS (fromB64' (toB64' (encodeBS s)))
|
||||
| otherwise = True
|
||||
|
|
Loading…
Add table
Reference in a new issue