Fix mangling of --json output of utf-8 characters when not running in a utf-8 locale

As long as all code imports Utility.Aeson rather than Data.Aeson,
and no Strings that may contain utf-8 characters are used for eg, object
keys via T.pack, this is guaranteed to fix the problem everywhere that
git-annex generates json.

It's kind of annoying to need to wrap ToJSON with a ToJSON', especially
since every data type that has a ToJSON instance has to be ported over.
However, that only took 50 lines of code, which is worth it to ensure full
coverage. I initially tried an alternative approach of a newtype FileEncoded,
which had to be used everywhere a String was fed into aeson, and chasing
down all the sites would have been far too hard. Did consider creating an
intentionally overlapping instance ToJSON String, and letting ghc fail
to build anything that passed in a String, but am not sure that wouldn't
pollute some library that git-annex depends on that happens to use ToJSON
String internally.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
Joey Hess 2018-04-16 15:42:45 -04:00
parent 6ddd374935
commit 89e1a05a8f
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
14 changed files with 173 additions and 62 deletions

View file

@ -7,6 +7,8 @@ git-annex (6.20180410) UNRELEASED; urgency=medium
Note that it's still allowed to move the content of a file
from one repository to another when numcopies is not satisfied, as long
as the move does not result in there being fewer copies.
* Fix mangling of --json output of utf-8 characters when not
running in a utf-8 locale.
-- Joey Hess <id@joeyh.name> Mon, 09 Apr 2018 14:03:28 -0400

View file

@ -11,9 +11,8 @@ module Command.Info where
import "mtl" Control.Monad.State.Strict
import qualified Data.Map.Strict as M
import qualified Data.Text as T
import qualified Data.Vector as V
import Data.Ord
import Data.Aeson hiding (json)
import Command
import qualified Git
@ -34,6 +33,7 @@ import Config
import Git.Config (boolConfig)
import qualified Git.LsTree as LsTree
import Utility.Percentage
import Utility.Aeson hiding (json)
import Types.Transfer
import Logs.Transfer
import Types.Key
@ -283,7 +283,7 @@ simpleStat desc getval = stat desc $ json id getval
nostat :: Stat
nostat = return Nothing
json :: ToJSON j => (j -> String) -> StatState j -> String -> StatState String
json :: ToJSON' j => (j -> String) -> StatState j -> String -> StatState String
json fmt a desc = do
j <- a
lift $ maybeShowJSON $ JSONChunk [(desc, j)]
@ -422,7 +422,7 @@ transfer_list :: Stat
transfer_list = stat desc $ nojson $ lift $ do
uuidmap <- Remote.remoteMap id
ts <- getTransfers
maybeShowJSON $ JSONChunk [(desc, map (uncurry jsonify) ts)]
maybeShowJSON $ JSONChunk [(desc, V.fromList $ map (uncurry jsonify) ts)]
return $ if null ts
then "none"
else multiLine $
@ -438,11 +438,11 @@ transfer_list = stat desc $ nojson $ lift $ do
, maybe (fromUUID $ transferUUID t) Remote.name $
M.lookup (transferUUID t) uuidmap
]
jsonify t i = object $ map (\(k, v) -> (T.pack k, v)) $
[ ("transfer", toJSON (formatDirection (transferDirection t)))
, ("key", toJSON (key2file (transferKey t)))
, ("file", toJSON afile)
, ("remote", toJSON (fromUUID (transferUUID t)))
jsonify t i = object $ map (\(k, v) -> (packString k, v)) $
[ ("transfer", toJSON' (formatDirection (transferDirection t)))
, ("key", toJSON' (transferKey t))
, ("file", toJSON' afile)
, ("remote", toJSON' (fromUUID (transferUUID t)))
]
where
AssociatedFile afile = associatedFile i
@ -476,10 +476,13 @@ numcopies_stats :: Stat
numcopies_stats = stat "numcopies stats" $ json fmt $
calc <$> (maybe M.empty numCopiesVarianceMap <$> cachedNumCopiesStats)
where
calc = map (\(variance, count) -> (show variance, count))
calc = V.fromList
. map (\(variance, count) -> (show variance, count))
. sortBy (flip (comparing fst))
. M.toList
fmt = multiLine . map (\(variance, count) -> "numcopies " ++ variance ++ ": " ++ show count)
fmt = multiLine
. map (\(variance, count) -> "numcopies " ++ variance ++ ": " ++ show count)
. V.toList
reposizes_stats :: Stat
reposizes_stats = stat desc $ nojson $ do

View file

@ -14,12 +14,12 @@ import Logs.MetaData
import Annex.WorkTree
import Messages.JSON (JSONActionItem(..))
import Types.Messages
import Utility.Aeson
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.ByteString.Lazy.UTF8 as BU
import Data.Aeson
import Control.Concurrent
cmd :: Command
@ -115,7 +115,7 @@ perform c o k = case getSet o of
cleanup :: Key -> CommandCleanup
cleanup k = do
m <- getCurrentMetaData k
let Object o = toJSON (MetaDataFields m)
let Object o = toJSON' (MetaDataFields m)
maybeShowJSON $ AesonObject o
showLongNote $ unlines $ concatMap showmeta $
map unwrapmeta (fromMetaData m)
@ -129,8 +129,8 @@ cleanup k = do
newtype MetaDataFields = MetaDataFields MetaData
deriving (Show)
instance ToJSON MetaDataFields where
toJSON (MetaDataFields m) = object [ (fieldsField, toJSON m) ]
instance ToJSON' MetaDataFields where
toJSON' (MetaDataFields m) = object [ (fieldsField, toJSON' m) ]
instance FromJSON MetaDataFields where
parseJSON (Object v) = do

View file

@ -15,6 +15,7 @@ import Remote.Web (getWebUrls)
import Annex.UUID
import qualified Data.Map as M
import qualified Data.Vector as V
cmd :: Command
cmd = noCommit $ withGlobalOptions [jsonOptions, annexedMatchingOptions] $
@ -77,7 +78,7 @@ perform remotemap key = do
untrustedheader = "The following untrusted locations may also have copies:\n"
ppwhereis h ls urls = do
descm <- uuidDescriptions
let urlvals = map (\(u, us) -> (u, Just us)) $
let urlvals = map (\(u, us) -> (u, Just (V.fromList us))) $
filter (\(u,_) -> u `elem` ls) urls
prettyPrintUUIDsWith (Just "urls") h descm (const Nothing) urlvals

6
Key.hs
View file

@ -22,7 +22,6 @@ module Key (
prop_isomorphic_key_decode
) where
import Data.Aeson
import Data.Char
import qualified Data.Text as T
@ -30,6 +29,7 @@ import Common
import Types.Key
import Utility.QuickCheck
import Utility.Bloom
import Utility.Aeson
import qualified Utility.SimpleProtocol as Proto
stubKey :: Key
@ -155,8 +155,8 @@ instance Hashable Key where
hashIO32 = hashIO32 . key2file
hashIO64 = hashIO64 . key2file
instance ToJSON Key where
toJSON = toJSON . key2file
instance ToJSON' Key where
toJSON' = toJSON' . key2file
instance FromJSON Key where
parseJSON (String t) = maybe mempty pure $ file2key $ T.unpack t

View file

@ -26,12 +26,10 @@ module Messages.JSON (
JSONActionItem(..),
) where
import Data.Aeson
import Control.Applicative
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy as L
import qualified Data.HashMap.Strict as HM
import System.IO
import System.IO.Unsafe (unsafePerformIO)
@ -44,6 +42,7 @@ import Types.Messages
import Key
import Utility.Metered
import Utility.Percentage
import Utility.Aeson
-- A global lock to avoid concurrent threads emitting json at the same time.
{-# NOINLINE emitLock #-}
@ -53,7 +52,7 @@ emitLock = unsafePerformIO $ newMVar ()
emit :: Object -> IO ()
emit o = do
takeMVar emitLock
B.hPut stdout (encode o)
L.hPut stdout (encode o)
putStr "\n"
putMVar emitLock ()
@ -67,7 +66,7 @@ none = id
start :: String -> Maybe FilePath -> Maybe Key -> JSONBuilder
start command file key _ = Just (o, False)
where
Object o = toJSON $ JSONActionItem
Object o = toJSON' $ JSONActionItem
{ itemCommand = Just command
, itemKey = key
, itemFile = file
@ -75,7 +74,7 @@ start command file key _ = Just (o, False)
}
end :: Bool -> JSONBuilder
end b (Just (o, _)) = Just (HM.insert "success" (toJSON b) o, True)
end b (Just (o, _)) = Just (HM.insert "success" (toJSON' b) o, True)
end _ Nothing = Nothing
finalize :: JSONOptions -> Object -> Object
@ -91,32 +90,32 @@ addErrorMessage msg o =
where
combinearray (Array new) (Array old) = Array (old <> new)
combinearray new _old = new
v = Array $ V.fromList $ map (String . T.pack) msg
v = Array $ V.fromList $ map (String . packString) msg
note :: String -> JSONBuilder
note _ Nothing = Nothing
note s (Just (o, e)) = Just (HM.insertWith combinelines "note" (toJSON s) o, e)
note s (Just (o, e)) = Just (HM.insertWith combinelines "note" (toJSON' s) o, e)
where
combinelines (String new) (String old) =
String (old <> T.pack "\n" <> new)
String (old <> "\n" <> new)
combinelines new _old = new
info :: String -> JSONBuilder
info s _ = Just (o, True)
where
Object o = object ["info" .= toJSON s]
Object o = object ["info" .= toJSON' s]
data JSONChunk v where
AesonObject :: Object -> JSONChunk Object
JSONChunk :: ToJSON v => [(String, v)] -> JSONChunk [(String, v)]
JSONChunk :: ToJSON' v => [(String, v)] -> JSONChunk [(String, v)]
add :: JSONChunk v -> JSONBuilder
add v (Just (o, e)) = Just (HM.union o' o, e)
where
Object o' = case v of
AesonObject ao -> Object ao
JSONChunk l -> object (map mkPair l)
mkPair (s, d) = (T.pack s, toJSON d)
JSONChunk l -> object $ map mkPair l
mkPair (s, d) = (packString s, toJSON' d)
add _ Nothing = Nothing
complete :: JSONChunk v -> JSONBuilder
@ -145,8 +144,8 @@ data DualDisp = DualDisp
, dispJson :: String
}
instance ToJSON DualDisp where
toJSON = toJSON . dispJson
instance ToJSON' DualDisp where
toJSON' = toJSON' . dispJson
instance Show DualDisp where
show = dispNormal
@ -156,10 +155,10 @@ instance Show DualDisp where
-- serialization of Map, which uses "[key, value]".
data ObjectMap a = ObjectMap { fromObjectMap :: M.Map String a }
instance ToJSON a => ToJSON (ObjectMap a) where
toJSON (ObjectMap m) = object $ map go $ M.toList m
instance ToJSON' a => ToJSON' (ObjectMap a) where
toJSON' (ObjectMap m) = object $ map go $ M.toList m
where
go (k, v) = (T.pack k, toJSON v)
go (k, v) = (packString k, toJSON' v)
-- An item that a git-annex command acts on, and displays a JSON object about.
data JSONActionItem a = JSONActionItem
@ -170,13 +169,13 @@ data JSONActionItem a = JSONActionItem
}
deriving (Show)
instance ToJSON (JSONActionItem a) where
toJSON i = object $ catMaybes
instance ToJSON' (JSONActionItem a) where
toJSON' i = object $ catMaybes
[ Just $ "command" .= itemCommand i
, case itemKey i of
Nothing -> Nothing
Just k -> Just $ "key" .= toJSON k
, Just $ "file" .= itemFile i
Just k -> Just $ "key" .= toJSON' k
, Just $ "file" .= toJSON' (itemFile i)
-- itemAdded is not included; must be added later by 'add'
]

View file

@ -57,9 +57,8 @@ module Remote (
) where
import Data.Ord
import Data.Aeson
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V
import Annex.Common
import Types.Remote
@ -74,6 +73,7 @@ import Config
import Config.DynamicConfig
import Git.Types (RemoteName)
import qualified Git
import Utility.Aeson
{- Map from UUIDs of Remotes to a calculated value. -}
remoteMap :: (Remote -> v) -> Annex (M.Map UUID v)
@ -197,7 +197,7 @@ prettyPrintUUIDsDescs header descm uuids =
{- An optional field can be included in the list of UUIDs. -}
prettyPrintUUIDsWith
:: ToJSON v
:: ToJSON' v
=> Maybe String
-> String
-> M.Map UUID RemoteName
@ -206,7 +206,7 @@ prettyPrintUUIDsWith
-> Annex String
prettyPrintUUIDsWith optfield header descm showval uuidvals = do
hereu <- getUUID
maybeShowJSON $ JSONChunk [(header, map (jsonify hereu) uuidvals)]
maybeShowJSON $ JSONChunk [(header, V.fromList $ map (jsonify hereu) uuidvals)]
return $ unwords $ map (\u -> "\t" ++ prettify hereu u ++ "\n") uuidvals
where
finddescription u = M.findWithDefault "" u descm
@ -224,11 +224,11 @@ prettyPrintUUIDsWith optfield header descm showval uuidvals = do
Nothing -> s
Just val -> val ++ ": " ++ s
jsonify hereu (u, optval) = object $ catMaybes
[ Just (T.pack "uuid", toJSON $ fromUUID u)
, Just (T.pack "description", toJSON $ finddescription u)
, Just (T.pack "here", toJSON $ hereu == u)
[ Just (packString "uuid", toJSON' $ fromUUID u)
, Just (packString "description", toJSON' $ finddescription u)
, Just (packString "here", toJSON' $ hereu == u)
, case (optfield, optval) of
(Just field, Just val) -> Just (T.pack field, toJSON val)
(Just field, Just val) -> Just (packString field, toJSON' val)
_ -> Nothing
]

View file

@ -23,7 +23,7 @@
module Remote.Tahoe (remote) where
import qualified Data.Map as M
import Data.Aeson
import Utility.Aeson
import Data.ByteString.Lazy.UTF8 (fromString)
import Control.Concurrent.STM

View file

@ -21,7 +21,6 @@ import Test.Tasty.Ingredients.Rerun
import Options.Applicative (switch, long, help, internal)
import qualified Data.Map as M
import qualified Data.Aeson
import qualified Data.ByteString.Lazy.UTF8 as BU8
import System.Environment
@ -83,6 +82,7 @@ import qualified Utility.HumanTime
import qualified Utility.Base64
import qualified Utility.Tmp.Dir
import qualified Utility.FileSystemEncoding
import qualified Utility.Aeson
#ifndef mingw32_HOST_OS
import qualified Remote.Helper.Encryptable
import qualified Types.Crypto
@ -971,7 +971,7 @@ test_merge = intmpclonerepo $
test_info :: Assertion
test_info = intmpclonerepo $ do
json <- BU8.fromString <$> git_annex_output "info" ["--json"]
case Data.Aeson.eitherDecode json :: Either String Data.Aeson.Value of
case Utility.Aeson.eitherDecode json :: Either String Utility.Aeson.Value of
Right _ -> return ()
Left e -> assertFailure e

View file

@ -9,7 +9,7 @@
module Types.Messages where
import qualified Data.Aeson as Aeson
import qualified Utility.Aeson as Aeson
import Control.Concurrent
#ifdef WITH_CONCURRENTOUTPUT

View file

@ -43,6 +43,7 @@ module Types.MetaData (
import Common
import Utility.Base64
import Utility.QuickCheck
import Utility.Aeson
import qualified Data.Text as T
import qualified Data.Set as S
@ -50,15 +51,14 @@ import qualified Data.Map as M
import qualified Data.HashMap.Strict as HM
import Data.Char
import qualified Data.CaseInsensitive as CI
import Data.Aeson
newtype MetaData = MetaData (M.Map MetaField (S.Set MetaValue))
deriving (Show, Eq, Ord)
instance ToJSON MetaData where
toJSON (MetaData m) = object $ map go (M.toList m)
instance ToJSON' MetaData where
toJSON' (MetaData m) = object $ map go (M.toList m)
where
go (MetaField f, s) = (T.pack (CI.original f), toJSON s)
go (MetaField f, s) = (packString (CI.original f), toJSON' s)
instance FromJSON MetaData where
parseJSON (Object o) = do
@ -82,8 +82,8 @@ newtype MetaField = MetaField (CI.CI String)
data MetaValue = MetaValue CurrentlySet String
deriving (Read, Show)
instance ToJSON MetaValue where
toJSON (MetaValue _ v) = toJSON v
instance ToJSON' MetaValue where
toJSON' (MetaValue _ v) = toJSON' v
instance FromJSON MetaValue where
parseJSON (String v) = return $ MetaValue (CurrentlySet True) (T.unpack v)

86
Utility/Aeson.hs Normal file
View file

@ -0,0 +1,86 @@
{- GHC File system encoding support for Aeson.
-
- Import instead of Data.Aeson
-
- Copyright 2018 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
{-# LANGUAGE FlexibleInstances, TypeSynonymInstances #-}
module Utility.Aeson (
module X,
ToJSON'(..),
encode,
packString,
) where
import Data.Aeson as X hiding (ToJSON, toJSON, encode)
import Data.Aeson hiding (encode)
import qualified Data.Aeson
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString as S
import qualified Data.Set
import qualified Data.Vector
import Prelude
import Utility.FileSystemEncoding
-- | Use this instead of Data.Aeson.encode to make sure that the
-- below String instance is used.
encode :: ToJSON' a => a -> L.ByteString
encode = Data.Aeson.encode . toJSON'
-- | Aeson has an unfortunate ToJSON instance for Char and [Char]
-- which does not support Strings containing UTF8 characters
-- encoded using the filesystem encoding when run in a non-utf8 locale.
--
-- Since we can't replace that with a instance that does the right
-- thing, instead here's a new class that handles String right.
class ToJSON' a where
toJSON' :: a -> Value
instance ToJSON' String where
toJSON' = toJSON . packString
-- | Pack a String to Text, correctly handling the filesystem encoding.
--
-- Use this instead of Data.Text.pack.
--
-- Note that if the string contains invalid UTF8 characters not using
-- the FileSystemEncoding, this is the same as Data.Text.pack.
packString :: String -> T.Text
packString s = case T.decodeUtf8' (S.concat $ L.toChunks $ encodeBS s) of
Right t -> t
Left _ -> T.pack s
-- | An instance for lists cannot be included as it would overlap with
-- the String instance. Instead, you can use a Vector.
instance ToJSON' s => ToJSON' (Data.Vector.Vector s) where
toJSON' = toJSON . map toJSON' . Data.Vector.toList
-- Aeson generates the same JSON for a Set as for a list.
instance ToJSON' s => ToJSON' (Data.Set.Set s) where
toJSON' = toJSON . map toJSON' . Data.Set.toList
instance (ToJSON' a, ToJSON a) => ToJSON' (Maybe a) where
toJSON' (Just a) = toJSON (Just (toJSON' a))
toJSON' v@Nothing = toJSON v
instance (ToJSON' a, ToJSON a, ToJSON' b, ToJSON b) => ToJSON' (a, b) where
toJSON' (a, b) = toJSON ((toJSON' a, toJSON' b))
instance ToJSON' Bool where
toJSON' = toJSON
instance ToJSON' Integer where
toJSON' = toJSON
instance ToJSON' Object where
toJSON' = toJSON
instance ToJSON' Value where
toJSON' = toJSON

View file

@ -2,6 +2,25 @@ json is defined as always utf-8. However, when LANG=C,
git-annex --json currently outputs "file":"<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"
instead of "file":"äöü東" for that utf-8 filename. --[[Joey]]
(Note that git-annex can operate on non-utf8 filenames; it's not defined
what the json contains then, which might or might not be considered a bug
but this is not about that.)
This can also affect keys when they contain some non-utf8 from eg the
extension. And metadata keys and values can contain non-utf8 and also get
converted to json with similar results.
Note that git-annex can operate on non-utf8 filenames and keys;
it's not defined what the json contains then, and it currently contains
similar garbage.
This happens because aeson's instance of ToJSON for Char uses
Text.singleton, and Text does not handle ghc's filesystem encoding
for String. Instead it defaults to `\65533` for each byte encoded with the
filesystem encoding.
So, git-annex will need to convert filenames and keys and anything else
that might use the filesystem encoding to Text itself in some
way that does respect the filesystem encoding. Ie, use encodeBS to convert
it to a ByteString and then Data.Text.Encoding.decodeUtf8.
> [[done]] that. --[[Joey]]
What about git-annex commands that take json as input,
when run in a non-utf8 locale? Tested that, it is handled ok. --[[Joey]]

View file

@ -999,6 +999,7 @@ Executable git-annex
Upgrade.V3
Upgrade.V4
Upgrade.V5
Utility.Aeson
Utility.Applicative
Utility.AuthToken
Utility.Base64