From 89e1a05a8f62ac6ac3ae3f24be0165fe1fcc6f75 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 16 Apr 2018 15:42:45 -0400 Subject: [PATCH] Fix mangling of --json output of utf-8 characters when not running in a utf-8 locale As long as all code imports Utility.Aeson rather than Data.Aeson, and no Strings that may contain utf-8 characters are used for eg, object keys via T.pack, this is guaranteed to fix the problem everywhere that git-annex generates json. It's kind of annoying to need to wrap ToJSON with a ToJSON', especially since every data type that has a ToJSON instance has to be ported over. However, that only took 50 lines of code, which is worth it to ensure full coverage. I initially tried an alternative approach of a newtype FileEncoded, which had to be used everywhere a String was fed into aeson, and chasing down all the sites would have been far too hard. Did consider creating an intentionally overlapping instance ToJSON String, and letting ghc fail to build anything that passed in a String, but am not sure that wouldn't pollute some library that git-annex depends on that happens to use ToJSON String internally. This commit was supported by the NSF-funded DataLad project. --- CHANGELOG | 2 + Command/Info.hs | 25 +++--- Command/MetaData.hs | 8 +- Command/Whereis.hs | 3 +- Key.hs | 6 +- Messages/JSON.hs | 43 +++++----- Remote.hs | 16 ++-- Remote/Tahoe.hs | 2 +- Test.hs | 4 +- Types/Messages.hs | 2 +- Types/MetaData.hs | 12 +-- Utility/Aeson.hs | 86 +++++++++++++++++++ ...n_should_be_utf8_regardless_of_locale.mdwn | 25 +++++- git-annex.cabal | 1 + 14 files changed, 173 insertions(+), 62 deletions(-) create mode 100644 Utility/Aeson.hs diff --git a/CHANGELOG b/CHANGELOG index b9701f3546..126a80dc69 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,8 @@ git-annex (6.20180410) UNRELEASED; urgency=medium Note that it's still allowed to move the content of a file from one repository to another when numcopies is not satisfied, as long as the move does not result in there being fewer copies. + * Fix mangling of --json output of utf-8 characters when not + running in a utf-8 locale. -- Joey Hess Mon, 09 Apr 2018 14:03:28 -0400 diff --git a/Command/Info.hs b/Command/Info.hs index 6ec6a1f71e..cc9c1b5fe0 100644 --- a/Command/Info.hs +++ b/Command/Info.hs @@ -11,9 +11,8 @@ module Command.Info where import "mtl" Control.Monad.State.Strict import qualified Data.Map.Strict as M -import qualified Data.Text as T +import qualified Data.Vector as V import Data.Ord -import Data.Aeson hiding (json) import Command import qualified Git @@ -34,6 +33,7 @@ import Config import Git.Config (boolConfig) import qualified Git.LsTree as LsTree import Utility.Percentage +import Utility.Aeson hiding (json) import Types.Transfer import Logs.Transfer import Types.Key @@ -283,7 +283,7 @@ simpleStat desc getval = stat desc $ json id getval nostat :: Stat nostat = return Nothing -json :: ToJSON j => (j -> String) -> StatState j -> String -> StatState String +json :: ToJSON' j => (j -> String) -> StatState j -> String -> StatState String json fmt a desc = do j <- a lift $ maybeShowJSON $ JSONChunk [(desc, j)] @@ -422,7 +422,7 @@ transfer_list :: Stat transfer_list = stat desc $ nojson $ lift $ do uuidmap <- Remote.remoteMap id ts <- getTransfers - maybeShowJSON $ JSONChunk [(desc, map (uncurry jsonify) ts)] + maybeShowJSON $ JSONChunk [(desc, V.fromList $ map (uncurry jsonify) ts)] return $ if null ts then "none" else multiLine $ @@ -438,11 +438,11 @@ transfer_list = stat desc $ nojson $ lift $ do , maybe (fromUUID $ transferUUID t) Remote.name $ M.lookup (transferUUID t) uuidmap ] - jsonify t i = object $ map (\(k, v) -> (T.pack k, v)) $ - [ ("transfer", toJSON (formatDirection (transferDirection t))) - , ("key", toJSON (key2file (transferKey t))) - , ("file", toJSON afile) - , ("remote", toJSON (fromUUID (transferUUID t))) + jsonify t i = object $ map (\(k, v) -> (packString k, v)) $ + [ ("transfer", toJSON' (formatDirection (transferDirection t))) + , ("key", toJSON' (transferKey t)) + , ("file", toJSON' afile) + , ("remote", toJSON' (fromUUID (transferUUID t))) ] where AssociatedFile afile = associatedFile i @@ -476,10 +476,13 @@ numcopies_stats :: Stat numcopies_stats = stat "numcopies stats" $ json fmt $ calc <$> (maybe M.empty numCopiesVarianceMap <$> cachedNumCopiesStats) where - calc = map (\(variance, count) -> (show variance, count)) + calc = V.fromList + . map (\(variance, count) -> (show variance, count)) . sortBy (flip (comparing fst)) . M.toList - fmt = multiLine . map (\(variance, count) -> "numcopies " ++ variance ++ ": " ++ show count) + fmt = multiLine + . map (\(variance, count) -> "numcopies " ++ variance ++ ": " ++ show count) + . V.toList reposizes_stats :: Stat reposizes_stats = stat desc $ nojson $ do diff --git a/Command/MetaData.hs b/Command/MetaData.hs index ef3f1da9a5..282b7fda05 100644 --- a/Command/MetaData.hs +++ b/Command/MetaData.hs @@ -14,12 +14,12 @@ import Logs.MetaData import Annex.WorkTree import Messages.JSON (JSONActionItem(..)) import Types.Messages +import Utility.Aeson import qualified Data.Set as S import qualified Data.Map as M import qualified Data.Text as T import qualified Data.ByteString.Lazy.UTF8 as BU -import Data.Aeson import Control.Concurrent cmd :: Command @@ -115,7 +115,7 @@ perform c o k = case getSet o of cleanup :: Key -> CommandCleanup cleanup k = do m <- getCurrentMetaData k - let Object o = toJSON (MetaDataFields m) + let Object o = toJSON' (MetaDataFields m) maybeShowJSON $ AesonObject o showLongNote $ unlines $ concatMap showmeta $ map unwrapmeta (fromMetaData m) @@ -129,8 +129,8 @@ cleanup k = do newtype MetaDataFields = MetaDataFields MetaData deriving (Show) -instance ToJSON MetaDataFields where - toJSON (MetaDataFields m) = object [ (fieldsField, toJSON m) ] +instance ToJSON' MetaDataFields where + toJSON' (MetaDataFields m) = object [ (fieldsField, toJSON' m) ] instance FromJSON MetaDataFields where parseJSON (Object v) = do diff --git a/Command/Whereis.hs b/Command/Whereis.hs index fddb2b5c6e..b14e231c17 100644 --- a/Command/Whereis.hs +++ b/Command/Whereis.hs @@ -15,6 +15,7 @@ import Remote.Web (getWebUrls) import Annex.UUID import qualified Data.Map as M +import qualified Data.Vector as V cmd :: Command cmd = noCommit $ withGlobalOptions [jsonOptions, annexedMatchingOptions] $ @@ -77,7 +78,7 @@ perform remotemap key = do untrustedheader = "The following untrusted locations may also have copies:\n" ppwhereis h ls urls = do descm <- uuidDescriptions - let urlvals = map (\(u, us) -> (u, Just us)) $ + let urlvals = map (\(u, us) -> (u, Just (V.fromList us))) $ filter (\(u,_) -> u `elem` ls) urls prettyPrintUUIDsWith (Just "urls") h descm (const Nothing) urlvals diff --git a/Key.hs b/Key.hs index 44e9acea4e..ade012a4ba 100644 --- a/Key.hs +++ b/Key.hs @@ -22,7 +22,6 @@ module Key ( prop_isomorphic_key_decode ) where -import Data.Aeson import Data.Char import qualified Data.Text as T @@ -30,6 +29,7 @@ import Common import Types.Key import Utility.QuickCheck import Utility.Bloom +import Utility.Aeson import qualified Utility.SimpleProtocol as Proto stubKey :: Key @@ -155,8 +155,8 @@ instance Hashable Key where hashIO32 = hashIO32 . key2file hashIO64 = hashIO64 . key2file -instance ToJSON Key where - toJSON = toJSON . key2file +instance ToJSON' Key where + toJSON' = toJSON' . key2file instance FromJSON Key where parseJSON (String t) = maybe mempty pure $ file2key $ T.unpack t diff --git a/Messages/JSON.hs b/Messages/JSON.hs index 897eb8cbfe..e63a263e3f 100644 --- a/Messages/JSON.hs +++ b/Messages/JSON.hs @@ -26,12 +26,10 @@ module Messages.JSON ( JSONActionItem(..), ) where -import Data.Aeson import Control.Applicative import qualified Data.Map as M -import qualified Data.Text as T import qualified Data.Vector as V -import qualified Data.ByteString.Lazy as B +import qualified Data.ByteString.Lazy as L import qualified Data.HashMap.Strict as HM import System.IO import System.IO.Unsafe (unsafePerformIO) @@ -44,6 +42,7 @@ import Types.Messages import Key import Utility.Metered import Utility.Percentage +import Utility.Aeson -- A global lock to avoid concurrent threads emitting json at the same time. {-# NOINLINE emitLock #-} @@ -53,7 +52,7 @@ emitLock = unsafePerformIO $ newMVar () emit :: Object -> IO () emit o = do takeMVar emitLock - B.hPut stdout (encode o) + L.hPut stdout (encode o) putStr "\n" putMVar emitLock () @@ -67,7 +66,7 @@ none = id start :: String -> Maybe FilePath -> Maybe Key -> JSONBuilder start command file key _ = Just (o, False) where - Object o = toJSON $ JSONActionItem + Object o = toJSON' $ JSONActionItem { itemCommand = Just command , itemKey = key , itemFile = file @@ -75,7 +74,7 @@ start command file key _ = Just (o, False) } end :: Bool -> JSONBuilder -end b (Just (o, _)) = Just (HM.insert "success" (toJSON b) o, True) +end b (Just (o, _)) = Just (HM.insert "success" (toJSON' b) o, True) end _ Nothing = Nothing finalize :: JSONOptions -> Object -> Object @@ -91,32 +90,32 @@ addErrorMessage msg o = where combinearray (Array new) (Array old) = Array (old <> new) combinearray new _old = new - v = Array $ V.fromList $ map (String . T.pack) msg + v = Array $ V.fromList $ map (String . packString) msg note :: String -> JSONBuilder note _ Nothing = Nothing -note s (Just (o, e)) = Just (HM.insertWith combinelines "note" (toJSON s) o, e) +note s (Just (o, e)) = Just (HM.insertWith combinelines "note" (toJSON' s) o, e) where combinelines (String new) (String old) = - String (old <> T.pack "\n" <> new) + String (old <> "\n" <> new) combinelines new _old = new info :: String -> JSONBuilder info s _ = Just (o, True) where - Object o = object ["info" .= toJSON s] + Object o = object ["info" .= toJSON' s] data JSONChunk v where AesonObject :: Object -> JSONChunk Object - JSONChunk :: ToJSON v => [(String, v)] -> JSONChunk [(String, v)] + JSONChunk :: ToJSON' v => [(String, v)] -> JSONChunk [(String, v)] add :: JSONChunk v -> JSONBuilder add v (Just (o, e)) = Just (HM.union o' o, e) where Object o' = case v of AesonObject ao -> Object ao - JSONChunk l -> object (map mkPair l) - mkPair (s, d) = (T.pack s, toJSON d) + JSONChunk l -> object $ map mkPair l + mkPair (s, d) = (packString s, toJSON' d) add _ Nothing = Nothing complete :: JSONChunk v -> JSONBuilder @@ -145,8 +144,8 @@ data DualDisp = DualDisp , dispJson :: String } -instance ToJSON DualDisp where - toJSON = toJSON . dispJson +instance ToJSON' DualDisp where + toJSON' = toJSON' . dispJson instance Show DualDisp where show = dispNormal @@ -156,10 +155,10 @@ instance Show DualDisp where -- serialization of Map, which uses "[key, value]". data ObjectMap a = ObjectMap { fromObjectMap :: M.Map String a } -instance ToJSON a => ToJSON (ObjectMap a) where - toJSON (ObjectMap m) = object $ map go $ M.toList m +instance ToJSON' a => ToJSON' (ObjectMap a) where + toJSON' (ObjectMap m) = object $ map go $ M.toList m where - go (k, v) = (T.pack k, toJSON v) + go (k, v) = (packString k, toJSON' v) -- An item that a git-annex command acts on, and displays a JSON object about. data JSONActionItem a = JSONActionItem @@ -170,13 +169,13 @@ data JSONActionItem a = JSONActionItem } deriving (Show) -instance ToJSON (JSONActionItem a) where - toJSON i = object $ catMaybes +instance ToJSON' (JSONActionItem a) where + toJSON' i = object $ catMaybes [ Just $ "command" .= itemCommand i , case itemKey i of Nothing -> Nothing - Just k -> Just $ "key" .= toJSON k - , Just $ "file" .= itemFile i + Just k -> Just $ "key" .= toJSON' k + , Just $ "file" .= toJSON' (itemFile i) -- itemAdded is not included; must be added later by 'add' ] diff --git a/Remote.hs b/Remote.hs index 8d826712c1..29f59a7bf8 100644 --- a/Remote.hs +++ b/Remote.hs @@ -57,9 +57,8 @@ module Remote ( ) where import Data.Ord -import Data.Aeson import qualified Data.Map as M -import qualified Data.Text as T +import qualified Data.Vector as V import Annex.Common import Types.Remote @@ -74,6 +73,7 @@ import Config import Config.DynamicConfig import Git.Types (RemoteName) import qualified Git +import Utility.Aeson {- Map from UUIDs of Remotes to a calculated value. -} remoteMap :: (Remote -> v) -> Annex (M.Map UUID v) @@ -197,7 +197,7 @@ prettyPrintUUIDsDescs header descm uuids = {- An optional field can be included in the list of UUIDs. -} prettyPrintUUIDsWith - :: ToJSON v + :: ToJSON' v => Maybe String -> String -> M.Map UUID RemoteName @@ -206,7 +206,7 @@ prettyPrintUUIDsWith -> Annex String prettyPrintUUIDsWith optfield header descm showval uuidvals = do hereu <- getUUID - maybeShowJSON $ JSONChunk [(header, map (jsonify hereu) uuidvals)] + maybeShowJSON $ JSONChunk [(header, V.fromList $ map (jsonify hereu) uuidvals)] return $ unwords $ map (\u -> "\t" ++ prettify hereu u ++ "\n") uuidvals where finddescription u = M.findWithDefault "" u descm @@ -224,11 +224,11 @@ prettyPrintUUIDsWith optfield header descm showval uuidvals = do Nothing -> s Just val -> val ++ ": " ++ s jsonify hereu (u, optval) = object $ catMaybes - [ Just (T.pack "uuid", toJSON $ fromUUID u) - , Just (T.pack "description", toJSON $ finddescription u) - , Just (T.pack "here", toJSON $ hereu == u) + [ Just (packString "uuid", toJSON' $ fromUUID u) + , Just (packString "description", toJSON' $ finddescription u) + , Just (packString "here", toJSON' $ hereu == u) , case (optfield, optval) of - (Just field, Just val) -> Just (T.pack field, toJSON val) + (Just field, Just val) -> Just (packString field, toJSON' val) _ -> Nothing ] diff --git a/Remote/Tahoe.hs b/Remote/Tahoe.hs index 0091f27ba3..6423fefdb1 100644 --- a/Remote/Tahoe.hs +++ b/Remote/Tahoe.hs @@ -23,7 +23,7 @@ module Remote.Tahoe (remote) where import qualified Data.Map as M -import Data.Aeson +import Utility.Aeson import Data.ByteString.Lazy.UTF8 (fromString) import Control.Concurrent.STM diff --git a/Test.hs b/Test.hs index b0f4186563..9bd43bbe69 100644 --- a/Test.hs +++ b/Test.hs @@ -21,7 +21,6 @@ import Test.Tasty.Ingredients.Rerun import Options.Applicative (switch, long, help, internal) import qualified Data.Map as M -import qualified Data.Aeson import qualified Data.ByteString.Lazy.UTF8 as BU8 import System.Environment @@ -83,6 +82,7 @@ import qualified Utility.HumanTime import qualified Utility.Base64 import qualified Utility.Tmp.Dir import qualified Utility.FileSystemEncoding +import qualified Utility.Aeson #ifndef mingw32_HOST_OS import qualified Remote.Helper.Encryptable import qualified Types.Crypto @@ -971,7 +971,7 @@ test_merge = intmpclonerepo $ test_info :: Assertion test_info = intmpclonerepo $ do json <- BU8.fromString <$> git_annex_output "info" ["--json"] - case Data.Aeson.eitherDecode json :: Either String Data.Aeson.Value of + case Utility.Aeson.eitherDecode json :: Either String Utility.Aeson.Value of Right _ -> return () Left e -> assertFailure e diff --git a/Types/Messages.hs b/Types/Messages.hs index d45174bb71..8ca60651f6 100644 --- a/Types/Messages.hs +++ b/Types/Messages.hs @@ -9,7 +9,7 @@ module Types.Messages where -import qualified Data.Aeson as Aeson +import qualified Utility.Aeson as Aeson import Control.Concurrent #ifdef WITH_CONCURRENTOUTPUT diff --git a/Types/MetaData.hs b/Types/MetaData.hs index 279aacbb84..e05a8f72ec 100644 --- a/Types/MetaData.hs +++ b/Types/MetaData.hs @@ -43,6 +43,7 @@ module Types.MetaData ( import Common import Utility.Base64 import Utility.QuickCheck +import Utility.Aeson import qualified Data.Text as T import qualified Data.Set as S @@ -50,15 +51,14 @@ import qualified Data.Map as M import qualified Data.HashMap.Strict as HM import Data.Char import qualified Data.CaseInsensitive as CI -import Data.Aeson newtype MetaData = MetaData (M.Map MetaField (S.Set MetaValue)) deriving (Show, Eq, Ord) -instance ToJSON MetaData where - toJSON (MetaData m) = object $ map go (M.toList m) +instance ToJSON' MetaData where + toJSON' (MetaData m) = object $ map go (M.toList m) where - go (MetaField f, s) = (T.pack (CI.original f), toJSON s) + go (MetaField f, s) = (packString (CI.original f), toJSON' s) instance FromJSON MetaData where parseJSON (Object o) = do @@ -82,8 +82,8 @@ newtype MetaField = MetaField (CI.CI String) data MetaValue = MetaValue CurrentlySet String deriving (Read, Show) -instance ToJSON MetaValue where - toJSON (MetaValue _ v) = toJSON v +instance ToJSON' MetaValue where + toJSON' (MetaValue _ v) = toJSON' v instance FromJSON MetaValue where parseJSON (String v) = return $ MetaValue (CurrentlySet True) (T.unpack v) diff --git a/Utility/Aeson.hs b/Utility/Aeson.hs new file mode 100644 index 0000000000..7147e516bb --- /dev/null +++ b/Utility/Aeson.hs @@ -0,0 +1,86 @@ +{- GHC File system encoding support for Aeson. + - + - Import instead of Data.Aeson + - + - Copyright 2018 Joey Hess + - + - License: BSD-2-clause + -} + +{-# LANGUAGE FlexibleInstances, TypeSynonymInstances #-} + +module Utility.Aeson ( + module X, + ToJSON'(..), + encode, + packString, +) where + +import Data.Aeson as X hiding (ToJSON, toJSON, encode) +import Data.Aeson hiding (encode) +import qualified Data.Aeson +import qualified Data.Text as T +import qualified Data.Text.Encoding as T +import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString as S +import qualified Data.Set +import qualified Data.Vector +import Prelude + +import Utility.FileSystemEncoding + +-- | Use this instead of Data.Aeson.encode to make sure that the +-- below String instance is used. +encode :: ToJSON' a => a -> L.ByteString +encode = Data.Aeson.encode . toJSON' + +-- | Aeson has an unfortunate ToJSON instance for Char and [Char] +-- which does not support Strings containing UTF8 characters +-- encoded using the filesystem encoding when run in a non-utf8 locale. +-- +-- Since we can't replace that with a instance that does the right +-- thing, instead here's a new class that handles String right. +class ToJSON' a where + toJSON' :: a -> Value + +instance ToJSON' String where + toJSON' = toJSON . packString + +-- | Pack a String to Text, correctly handling the filesystem encoding. +-- +-- Use this instead of Data.Text.pack. +-- +-- Note that if the string contains invalid UTF8 characters not using +-- the FileSystemEncoding, this is the same as Data.Text.pack. +packString :: String -> T.Text +packString s = case T.decodeUtf8' (S.concat $ L.toChunks $ encodeBS s) of + Right t -> t + Left _ -> T.pack s + +-- | An instance for lists cannot be included as it would overlap with +-- the String instance. Instead, you can use a Vector. +instance ToJSON' s => ToJSON' (Data.Vector.Vector s) where + toJSON' = toJSON . map toJSON' . Data.Vector.toList + +-- Aeson generates the same JSON for a Set as for a list. +instance ToJSON' s => ToJSON' (Data.Set.Set s) where + toJSON' = toJSON . map toJSON' . Data.Set.toList + +instance (ToJSON' a, ToJSON a) => ToJSON' (Maybe a) where + toJSON' (Just a) = toJSON (Just (toJSON' a)) + toJSON' v@Nothing = toJSON v + +instance (ToJSON' a, ToJSON a, ToJSON' b, ToJSON b) => ToJSON' (a, b) where + toJSON' (a, b) = toJSON ((toJSON' a, toJSON' b)) + +instance ToJSON' Bool where + toJSON' = toJSON + +instance ToJSON' Integer where + toJSON' = toJSON + +instance ToJSON' Object where + toJSON' = toJSON + +instance ToJSON' Value where + toJSON' = toJSON diff --git a/doc/bugs/json_should_be_utf8_regardless_of_locale.mdwn b/doc/bugs/json_should_be_utf8_regardless_of_locale.mdwn index 66d0de2244..ac7275a77a 100644 --- a/doc/bugs/json_should_be_utf8_regardless_of_locale.mdwn +++ b/doc/bugs/json_should_be_utf8_regardless_of_locale.mdwn @@ -2,6 +2,25 @@ json is defined as always utf-8. However, when LANG=C, git-annex --json currently outputs "file":"���������" instead of "file":"äöü東" for that utf-8 filename. --[[Joey]] -(Note that git-annex can operate on non-utf8 filenames; it's not defined -what the json contains then, which might or might not be considered a bug -but this is not about that.) +This can also affect keys when they contain some non-utf8 from eg the +extension. And metadata keys and values can contain non-utf8 and also get +converted to json with similar results. + +Note that git-annex can operate on non-utf8 filenames and keys; +it's not defined what the json contains then, and it currently contains +similar garbage. + +This happens because aeson's instance of ToJSON for Char uses +Text.singleton, and Text does not handle ghc's filesystem encoding +for String. Instead it defaults to `\65533` for each byte encoded with the +filesystem encoding. + +So, git-annex will need to convert filenames and keys and anything else +that might use the filesystem encoding to Text itself in some +way that does respect the filesystem encoding. Ie, use encodeBS to convert +it to a ByteString and then Data.Text.Encoding.decodeUtf8. + +> [[done]] that. --[[Joey]] + +What about git-annex commands that take json as input, +when run in a non-utf8 locale? Tested that, it is handled ok. --[[Joey]] diff --git a/git-annex.cabal b/git-annex.cabal index 4aa5bfcb8e..d2b2a06ffc 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -999,6 +999,7 @@ Executable git-annex Upgrade.V3 Upgrade.V4 Upgrade.V5 + Utility.Aeson Utility.Applicative Utility.AuthToken Utility.Base64