diff --git a/Git/Filename.hs b/Git/Filename.hs index 18888c6996..322054fd80 100644 --- a/Git/Filename.hs +++ b/Git/Filename.hs @@ -1,15 +1,17 @@ {- Some git commands output encoded filenames, in a rather annoyingly complex - C-style encoding. - - - Copyright 2010, 2011 Joey Hess + - Copyright 2010-2023 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} +{-# LANGUAGE OverloadedStrings #-} + module Git.Filename where import Common -import Utility.Format (decode_c, encode_c) +import Utility.Format (decode_c, encode_c, isUtf8Byte) import Utility.QuickCheck import Data.Char @@ -31,9 +33,11 @@ decode b = case S.uncons b of q :: Word8 q = fromIntegral (ord '"') -{- Should not need to use this, except for testing decode. -} -encode :: RawFilePath -> S.ByteString -encode s = encodeBS $ "\"" ++ encode_c (decodeBS s) ++ "\"" +-- always encodes and double quotes, even in cases that git does not +encodeAlways :: RawFilePath -> S.ByteString +encodeAlways s = "\"" <> encode_c needencode s <> "\"" + where + needencode c = isUtf8Byte c || c == fromIntegral (ord '"') -- Encoding and then decoding roundtrips only when the string does not -- contain high unicode, because eg, both "\12345" and "\227\128\185" @@ -43,6 +47,6 @@ encode s = encodeBS $ "\"" ++ encode_c (decodeBS s) ++ "\"" -- limits what's tested to ascii, so avoids running into it. prop_encode_decode_roundtrip :: TestableFilePath -> Bool prop_encode_decode_roundtrip ts = - s == fromRawFilePath (decode (encode (toRawFilePath s))) + s == fromRawFilePath (decode (encodeAlways (toRawFilePath s))) where s = fromTestableFilePath ts diff --git a/Types/Transferrer.hs b/Types/Transferrer.hs index 1ce98e7cce..7cdfd10f36 100644 --- a/Types/Transferrer.hs +++ b/Types/Transferrer.hs @@ -83,9 +83,9 @@ instance Proto.Receivable TransferRequest where instance Proto.Sendable TransferResponse where formatMessage (TransferOutput (OutputMessage m)) = - ["om", Proto.serialize (encode_c (decodeBS m))] + ["om", Proto.serialize (decodeBS (encode_c isUtf8Byte m))] formatMessage (TransferOutput (OutputError e)) = - ["oe", Proto.serialize (encode_c e)] + ["oe", Proto.serialize (decodeBS (encode_c isUtf8Byte (encodeBS e)))] formatMessage (TransferOutput BeginProgressMeter) = ["opb"] formatMessage (TransferOutput (UpdateProgressMeterTotalSize (TotalSize sz))) = @@ -99,7 +99,7 @@ instance Proto.Sendable TransferResponse where formatMessage (TransferOutput EndPrompt) = ["opre"] formatMessage (TransferOutput (JSONObject b)) = - ["oj", Proto.serialize (encode_c (decodeBL b))] + ["oj", Proto.serialize (decodeBS (encode_c isUtf8Byte (L.toStrict b)))] formatMessage (TransferResult True) = ["t"] formatMessage (TransferResult False) = @@ -141,7 +141,9 @@ instance Proto.Serializable TransferRemote where serialize (TransferRemoteUUID u) = 'u':fromUUID u -- A remote name could contain whitespace or newlines, which needs -- to be escaped for the protocol. Use C-style encoding. - serialize (TransferRemoteName r) = 'r':encode_c' isSpace r + serialize (TransferRemoteName r) = 'r':decodeBS (encode_c is_space_or_unicode (encodeBS r)) + where + is_space_or_unicode c = isUtf8Byte c || isSpace (chr (fromIntegral c)) deserialize ('u':u) = Just (TransferRemoteUUID (toUUID u)) deserialize ('r':r) = Just (TransferRemoteName (decodeBS (decode_c (encodeBS r)))) @@ -151,7 +153,7 @@ instance Proto.Serializable TransferAssociatedFile where -- Comes last, so whitespace is ok. But, in case the filename -- contains eg a newline, escape it. Use C-style encoding. serialize (TransferAssociatedFile (AssociatedFile (Just f))) = - encode_c (fromRawFilePath f) + decodeBS (encode_c isUtf8Byte f) serialize (TransferAssociatedFile (AssociatedFile Nothing)) = "" deserialize "" = Just $ TransferAssociatedFile $ diff --git a/Utility/Format.hs b/Utility/Format.hs index 04db3fe024..50d57312d0 100644 --- a/Utility/Format.hs +++ b/Utility/Format.hs @@ -13,6 +13,7 @@ module Utility.Format ( decode_c, encode_c, encode_c', + isUtf8Byte, prop_encode_c_decode_c_roundtrip ) where @@ -21,7 +22,6 @@ import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii import Data.Maybe (fromMaybe) import Data.Word (Word8) import Data.List (isPrefixOf) -import qualified Codec.Binary.UTF8.String import qualified Data.Map as M import qualified Data.ByteString as S @@ -53,7 +53,8 @@ format f vars = concatMap expand f where expand (Const s) = s expand (Var name j esc) - | esc = justify j $ encode_c' isSpace $ getvar name + | esc = justify j $ decodeBS $ encode_c needescape $ + encodeBS $ getvar name | otherwise = justify j $ getvar name getvar name = fromMaybe "" $ M.lookup name vars justify UnJustified s = s @@ -61,6 +62,9 @@ format f vars = concatMap expand f justify (RightJustified i) s = pad i s ++ s pad i s = take (i - length s) spaces spaces = repeat ' ' + needescape c = isUtf8Byte c || + isSpace (chr (fromIntegral c)) || + c == fromIntegral (ord '"') {- Generates a Format that can be used to expand variables in a - format string, such as "${foo} ${bar;10} ${baz;-10}\n" @@ -173,42 +177,52 @@ decode_c s echar 'r' = '\r' echar 't' = '\t' echar 'v' = '\v' - echar a = a + echar a = a -- \\ decodes to '\', and \" to '"' handle' b = (S.empty, b) -{- Inverse of decode_c. - - - - Note that this operates on String, not ByteString, which is important in - - order to be able to handle unicode characters, which get encoded in - - octal. -} -encode_c :: String -> String -encode_c = encode_c' (const False) +{- Inverse of decode_c. Encodes ascii control characters as well as + - bytes that match the predicate. (And also '\' itself.) + -} +encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString +encode_c p s = case encode_c' p s of + Just s' -> s' + Nothing -> s -{- Encodes special characters, as well as any matching the predicate. -} -encode_c' :: (Char -> Bool) -> String -> String -encode_c' p = concatMap echar +{- Returns Nothing when nothing needs to be escaped in the input ByteString. -} +encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString +encode_c' p s + | S.any needencode s = Just (S.concatMap echar s) + | otherwise = Nothing where - e c = '\\' : [c] - echar '\a' = e 'a' - echar '\b' = e 'b' - echar '\f' = e 'f' - echar '\n' = e 'n' - echar '\r' = e 'r' - echar '\t' = e 't' - echar '\v' = e 'v' - echar '\\' = e '\\' - echar '"' = e '"' + needencode c = iscontrol c || c == del || c == e || p c + + e = fromIntegral (ord '\\') + q = fromIntegral (ord '"') + del = 0x7F + iscontrol c = c < 0x20 + + ec c = S.pack [e, fromIntegral (ord c)] + + echar 0x7 = ec 'a' + echar 0x8 = ec 'b' + echar 0x0C = ec 'f' + echar 0x0A = ec 'n' + echar 0x0D = ec 'r' + echar 0x09 = ec 't' + echar 0x0B = ec 'v' echar c - | ord c < 0x20 = e_asc c -- low ascii - | ord c >= 256 = e_utf c -- unicode - | ord c > 0x7E = e_asc c -- high ascii - | p c = e_asc c - | otherwise = [c] - -- unicode character is decomposed to individual Word8s, - -- and each is shown in octal - e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8]) - e_asc c = showoctal $ ord c - showoctal i = '\\' : printf "%03o" i + | c == e = ec '\\' -- escape the escape character itself + | iscontrol c = showoctal c + | c == del = showoctal c + | p c = if c == q + then ec '"' -- escape double quote + else showoctal c + | otherwise = S.singleton c + + showoctal i = encodeBS ('\\' : printf "%03o" i) + +isUtf8Byte :: Word8 -> Bool +isUtf8Byte c = c >= 0x80 {- For quickcheck. - @@ -219,6 +233,7 @@ encode_c' p = concatMap echar - This property papers over the problem, by only testing ascii. -} prop_encode_c_decode_c_roundtrip :: String -> Bool -prop_encode_c_decode_c_roundtrip s = s' == decodeBS (decode_c (encodeBS (encode_c s'))) +prop_encode_c_decode_c_roundtrip s = s' == + decodeBS (decode_c (encode_c isUtf8Byte (encodeBS s'))) where s' = filter isAscii s