convert encode_c to ByteString

This turns out to be possible after all, because the old one decomposed
a unicode Char to multiple Word8s and encoded those. It should be faster
in some places, particularly in Git.Filename.encodeAlways.

The old version encoded all unicode by default as well as ascii control
characters and also '"'. The new one only encodes ascii control
characters by default.

That old behavior was visible in Utility.Format.format, which did escape
'"' when used in eg git-annex find --format='${escaped_file}\n'
So made sure to keep that working the same. Although the man page only
says it will escape "unusual" characters, so it might be able to be
changed.

Git.Filename.encodeAlways also needs to escape '"' ; that was the
original reason that was escaped.

Types.Transferrer I judge is ok to not escape '"', because the escaped
value is sent in a line-based protocol, which is decoded at the other
end by decode_c. So old git-annex and new will be fine whether that is
escaped or not, the result will be the same.

Note that when asked to escape a double quote, it is escaped to \"
rather than to \042. That's the same behavior as git has. It's
perhaps somehow more of a special case than it needs to be.

Sponsored-by: k0ld on Patreon
This commit is contained in:
Joey Hess 2023-04-07 16:47:26 -04:00
parent 371d4f8183
commit d9b6be7782
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 66 additions and 45 deletions

View file

@ -1,15 +1,17 @@
{- Some git commands output encoded filenames, in a rather annoyingly complex {- Some git commands output encoded filenames, in a rather annoyingly complex
- C-style encoding. - C-style encoding.
- -
- Copyright 2010, 2011 Joey Hess <id@joeyh.name> - Copyright 2010-2023 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
{-# LANGUAGE OverloadedStrings #-}
module Git.Filename where module Git.Filename where
import Common import Common
import Utility.Format (decode_c, encode_c) import Utility.Format (decode_c, encode_c, isUtf8Byte)
import Utility.QuickCheck import Utility.QuickCheck
import Data.Char import Data.Char
@ -31,9 +33,11 @@ decode b = case S.uncons b of
q :: Word8 q :: Word8
q = fromIntegral (ord '"') q = fromIntegral (ord '"')
{- Should not need to use this, except for testing decode. -} -- always encodes and double quotes, even in cases that git does not
encode :: RawFilePath -> S.ByteString encodeAlways :: RawFilePath -> S.ByteString
encode s = encodeBS $ "\"" ++ encode_c (decodeBS s) ++ "\"" encodeAlways s = "\"" <> encode_c needencode s <> "\""
where
needencode c = isUtf8Byte c || c == fromIntegral (ord '"')
-- Encoding and then decoding roundtrips only when the string does not -- Encoding and then decoding roundtrips only when the string does not
-- contain high unicode, because eg, both "\12345" and "\227\128\185" -- contain high unicode, because eg, both "\12345" and "\227\128\185"
@ -43,6 +47,6 @@ encode s = encodeBS $ "\"" ++ encode_c (decodeBS s) ++ "\""
-- limits what's tested to ascii, so avoids running into it. -- limits what's tested to ascii, so avoids running into it.
prop_encode_decode_roundtrip :: TestableFilePath -> Bool prop_encode_decode_roundtrip :: TestableFilePath -> Bool
prop_encode_decode_roundtrip ts = prop_encode_decode_roundtrip ts =
s == fromRawFilePath (decode (encode (toRawFilePath s))) s == fromRawFilePath (decode (encodeAlways (toRawFilePath s)))
where where
s = fromTestableFilePath ts s = fromTestableFilePath ts

View file

@ -83,9 +83,9 @@ instance Proto.Receivable TransferRequest where
instance Proto.Sendable TransferResponse where instance Proto.Sendable TransferResponse where
formatMessage (TransferOutput (OutputMessage m)) = formatMessage (TransferOutput (OutputMessage m)) =
["om", Proto.serialize (encode_c (decodeBS m))] ["om", Proto.serialize (decodeBS (encode_c isUtf8Byte m))]
formatMessage (TransferOutput (OutputError e)) = formatMessage (TransferOutput (OutputError e)) =
["oe", Proto.serialize (encode_c e)] ["oe", Proto.serialize (decodeBS (encode_c isUtf8Byte (encodeBS e)))]
formatMessage (TransferOutput BeginProgressMeter) = formatMessage (TransferOutput BeginProgressMeter) =
["opb"] ["opb"]
formatMessage (TransferOutput (UpdateProgressMeterTotalSize (TotalSize sz))) = formatMessage (TransferOutput (UpdateProgressMeterTotalSize (TotalSize sz))) =
@ -99,7 +99,7 @@ instance Proto.Sendable TransferResponse where
formatMessage (TransferOutput EndPrompt) = formatMessage (TransferOutput EndPrompt) =
["opre"] ["opre"]
formatMessage (TransferOutput (JSONObject b)) = formatMessage (TransferOutput (JSONObject b)) =
["oj", Proto.serialize (encode_c (decodeBL b))] ["oj", Proto.serialize (decodeBS (encode_c isUtf8Byte (L.toStrict b)))]
formatMessage (TransferResult True) = formatMessage (TransferResult True) =
["t"] ["t"]
formatMessage (TransferResult False) = formatMessage (TransferResult False) =
@ -141,7 +141,9 @@ instance Proto.Serializable TransferRemote where
serialize (TransferRemoteUUID u) = 'u':fromUUID u serialize (TransferRemoteUUID u) = 'u':fromUUID u
-- A remote name could contain whitespace or newlines, which needs -- A remote name could contain whitespace or newlines, which needs
-- to be escaped for the protocol. Use C-style encoding. -- to be escaped for the protocol. Use C-style encoding.
serialize (TransferRemoteName r) = 'r':encode_c' isSpace r serialize (TransferRemoteName r) = 'r':decodeBS (encode_c is_space_or_unicode (encodeBS r))
where
is_space_or_unicode c = isUtf8Byte c || isSpace (chr (fromIntegral c))
deserialize ('u':u) = Just (TransferRemoteUUID (toUUID u)) deserialize ('u':u) = Just (TransferRemoteUUID (toUUID u))
deserialize ('r':r) = Just (TransferRemoteName (decodeBS (decode_c (encodeBS r)))) deserialize ('r':r) = Just (TransferRemoteName (decodeBS (decode_c (encodeBS r))))
@ -151,7 +153,7 @@ instance Proto.Serializable TransferAssociatedFile where
-- Comes last, so whitespace is ok. But, in case the filename -- Comes last, so whitespace is ok. But, in case the filename
-- contains eg a newline, escape it. Use C-style encoding. -- contains eg a newline, escape it. Use C-style encoding.
serialize (TransferAssociatedFile (AssociatedFile (Just f))) = serialize (TransferAssociatedFile (AssociatedFile (Just f))) =
encode_c (fromRawFilePath f) decodeBS (encode_c isUtf8Byte f)
serialize (TransferAssociatedFile (AssociatedFile Nothing)) = "" serialize (TransferAssociatedFile (AssociatedFile Nothing)) = ""
deserialize "" = Just $ TransferAssociatedFile $ deserialize "" = Just $ TransferAssociatedFile $

View file

@ -13,6 +13,7 @@ module Utility.Format (
decode_c, decode_c,
encode_c, encode_c,
encode_c', encode_c',
isUtf8Byte,
prop_encode_c_decode_c_roundtrip prop_encode_c_decode_c_roundtrip
) where ) where
@ -21,7 +22,6 @@ import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.Word (Word8) import Data.Word (Word8)
import Data.List (isPrefixOf) import Data.List (isPrefixOf)
import qualified Codec.Binary.UTF8.String
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.ByteString as S import qualified Data.ByteString as S
@ -53,7 +53,8 @@ format f vars = concatMap expand f
where where
expand (Const s) = s expand (Const s) = s
expand (Var name j esc) expand (Var name j esc)
| esc = justify j $ encode_c' isSpace $ getvar name | esc = justify j $ decodeBS $ encode_c needescape $
encodeBS $ getvar name
| otherwise = justify j $ getvar name | otherwise = justify j $ getvar name
getvar name = fromMaybe "" $ M.lookup name vars getvar name = fromMaybe "" $ M.lookup name vars
justify UnJustified s = s justify UnJustified s = s
@ -61,6 +62,9 @@ format f vars = concatMap expand f
justify (RightJustified i) s = pad i s ++ s justify (RightJustified i) s = pad i s ++ s
pad i s = take (i - length s) spaces pad i s = take (i - length s) spaces
spaces = repeat ' ' spaces = repeat ' '
needescape c = isUtf8Byte c ||
isSpace (chr (fromIntegral c)) ||
c == fromIntegral (ord '"')
{- Generates a Format that can be used to expand variables in a {- Generates a Format that can be used to expand variables in a
- format string, such as "${foo} ${bar;10} ${baz;-10}\n" - format string, such as "${foo} ${bar;10} ${baz;-10}\n"
@ -173,42 +177,52 @@ decode_c s
echar 'r' = '\r' echar 'r' = '\r'
echar 't' = '\t' echar 't' = '\t'
echar 'v' = '\v' echar 'v' = '\v'
echar a = a echar a = a -- \\ decodes to '\', and \" to '"'
handle' b = (S.empty, b) handle' b = (S.empty, b)
{- Inverse of decode_c. {- Inverse of decode_c. Encodes ascii control characters as well as
- - bytes that match the predicate. (And also '\' itself.)
- Note that this operates on String, not ByteString, which is important in -}
- order to be able to handle unicode characters, which get encoded in encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString
- octal. -} encode_c p s = case encode_c' p s of
encode_c :: String -> String Just s' -> s'
encode_c = encode_c' (const False) Nothing -> s
{- Encodes special characters, as well as any matching the predicate. -} {- Returns Nothing when nothing needs to be escaped in the input ByteString. -}
encode_c' :: (Char -> Bool) -> String -> String encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString
encode_c' p = concatMap echar encode_c' p s
| S.any needencode s = Just (S.concatMap echar s)
| otherwise = Nothing
where where
e c = '\\' : [c] needencode c = iscontrol c || c == del || c == e || p c
echar '\a' = e 'a'
echar '\b' = e 'b' e = fromIntegral (ord '\\')
echar '\f' = e 'f' q = fromIntegral (ord '"')
echar '\n' = e 'n' del = 0x7F
echar '\r' = e 'r' iscontrol c = c < 0x20
echar '\t' = e 't'
echar '\v' = e 'v' ec c = S.pack [e, fromIntegral (ord c)]
echar '\\' = e '\\'
echar '"' = e '"' echar 0x7 = ec 'a'
echar 0x8 = ec 'b'
echar 0x0C = ec 'f'
echar 0x0A = ec 'n'
echar 0x0D = ec 'r'
echar 0x09 = ec 't'
echar 0x0B = ec 'v'
echar c echar c
| ord c < 0x20 = e_asc c -- low ascii | c == e = ec '\\' -- escape the escape character itself
| ord c >= 256 = e_utf c -- unicode | iscontrol c = showoctal c
| ord c > 0x7E = e_asc c -- high ascii | c == del = showoctal c
| p c = e_asc c | p c = if c == q
| otherwise = [c] then ec '"' -- escape double quote
-- unicode character is decomposed to individual Word8s, else showoctal c
-- and each is shown in octal | otherwise = S.singleton c
e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
e_asc c = showoctal $ ord c showoctal i = encodeBS ('\\' : printf "%03o" i)
showoctal i = '\\' : printf "%03o" i
isUtf8Byte :: Word8 -> Bool
isUtf8Byte c = c >= 0x80
{- For quickcheck. {- For quickcheck.
- -
@ -219,6 +233,7 @@ encode_c' p = concatMap echar
- This property papers over the problem, by only testing ascii. - This property papers over the problem, by only testing ascii.
-} -}
prop_encode_c_decode_c_roundtrip :: String -> Bool prop_encode_c_decode_c_roundtrip :: String -> Bool
prop_encode_c_decode_c_roundtrip s = s' == decodeBS (decode_c (encodeBS (encode_c s'))) prop_encode_c_decode_c_roundtrip s = s' ==
decodeBS (decode_c (encode_c isUtf8Byte (encodeBS s')))
where where
s' = filter isAscii s s' = filter isAscii s