convert encode_c to ByteString

This turns out to be possible after all, because the old one decomposed
a unicode Char to multiple Word8s and encoded those. It should be faster
in some places, particularly in Git.Filename.encodeAlways.

The old version encoded all unicode by default as well as ascii control
characters and also '"'. The new one only encodes ascii control
characters by default.

That old behavior was visible in Utility.Format.format, which did escape
'"' when used in eg git-annex find --format='${escaped_file}\n'
So made sure to keep that working the same. Although the man page only
says it will escape "unusual" characters, so it might be able to be
changed.

Git.Filename.encodeAlways also needs to escape '"' ; that was the
original reason that was escaped.

Types.Transferrer I judge is ok to not escape '"', because the escaped
value is sent in a line-based protocol, which is decoded at the other
end by decode_c. So old git-annex and new will be fine whether that is
escaped or not, the result will be the same.

Note that when asked to escape a double quote, it is escaped to \"
rather than to \042. That's the same behavior as git has. It's
perhaps somehow more of a special case than it needs to be.

Sponsored-by: k0ld on Patreon
This commit is contained in:
Joey Hess 2023-04-07 16:47:26 -04:00
parent 371d4f8183
commit d9b6be7782
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 66 additions and 45 deletions

View file

@ -1,15 +1,17 @@
{- Some git commands output encoded filenames, in a rather annoyingly complex
- C-style encoding.
-
- Copyright 2010, 2011 Joey Hess <id@joeyh.name>
- Copyright 2010-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Git.Filename where
import Common
import Utility.Format (decode_c, encode_c)
import Utility.Format (decode_c, encode_c, isUtf8Byte)
import Utility.QuickCheck
import Data.Char
@ -31,9 +33,11 @@ decode b = case S.uncons b of
q :: Word8
q = fromIntegral (ord '"')
{- Should not need to use this, except for testing decode. -}
encode :: RawFilePath -> S.ByteString
encode s = encodeBS $ "\"" ++ encode_c (decodeBS s) ++ "\""
-- always encodes and double quotes, even in cases that git does not
encodeAlways :: RawFilePath -> S.ByteString
encodeAlways s = "\"" <> encode_c needencode s <> "\""
where
needencode c = isUtf8Byte c || c == fromIntegral (ord '"')
-- Encoding and then decoding roundtrips only when the string does not
-- contain high unicode, because eg, both "\12345" and "\227\128\185"
@ -43,6 +47,6 @@ encode s = encodeBS $ "\"" ++ encode_c (decodeBS s) ++ "\""
-- limits what's tested to ascii, so avoids running into it.
prop_encode_decode_roundtrip :: TestableFilePath -> Bool
prop_encode_decode_roundtrip ts =
s == fromRawFilePath (decode (encode (toRawFilePath s)))
s == fromRawFilePath (decode (encodeAlways (toRawFilePath s)))
where
s = fromTestableFilePath ts

View file

@ -83,9 +83,9 @@ instance Proto.Receivable TransferRequest where
instance Proto.Sendable TransferResponse where
formatMessage (TransferOutput (OutputMessage m)) =
["om", Proto.serialize (encode_c (decodeBS m))]
["om", Proto.serialize (decodeBS (encode_c isUtf8Byte m))]
formatMessage (TransferOutput (OutputError e)) =
["oe", Proto.serialize (encode_c e)]
["oe", Proto.serialize (decodeBS (encode_c isUtf8Byte (encodeBS e)))]
formatMessage (TransferOutput BeginProgressMeter) =
["opb"]
formatMessage (TransferOutput (UpdateProgressMeterTotalSize (TotalSize sz))) =
@ -99,7 +99,7 @@ instance Proto.Sendable TransferResponse where
formatMessage (TransferOutput EndPrompt) =
["opre"]
formatMessage (TransferOutput (JSONObject b)) =
["oj", Proto.serialize (encode_c (decodeBL b))]
["oj", Proto.serialize (decodeBS (encode_c isUtf8Byte (L.toStrict b)))]
formatMessage (TransferResult True) =
["t"]
formatMessage (TransferResult False) =
@ -141,7 +141,9 @@ instance Proto.Serializable TransferRemote where
serialize (TransferRemoteUUID u) = 'u':fromUUID u
-- A remote name could contain whitespace or newlines, which needs
-- to be escaped for the protocol. Use C-style encoding.
serialize (TransferRemoteName r) = 'r':encode_c' isSpace r
serialize (TransferRemoteName r) = 'r':decodeBS (encode_c is_space_or_unicode (encodeBS r))
where
is_space_or_unicode c = isUtf8Byte c || isSpace (chr (fromIntegral c))
deserialize ('u':u) = Just (TransferRemoteUUID (toUUID u))
deserialize ('r':r) = Just (TransferRemoteName (decodeBS (decode_c (encodeBS r))))
@ -151,7 +153,7 @@ instance Proto.Serializable TransferAssociatedFile where
-- Comes last, so whitespace is ok. But, in case the filename
-- contains eg a newline, escape it. Use C-style encoding.
serialize (TransferAssociatedFile (AssociatedFile (Just f))) =
encode_c (fromRawFilePath f)
decodeBS (encode_c isUtf8Byte f)
serialize (TransferAssociatedFile (AssociatedFile Nothing)) = ""
deserialize "" = Just $ TransferAssociatedFile $

View file

@ -13,6 +13,7 @@ module Utility.Format (
decode_c,
encode_c,
encode_c',
isUtf8Byte,
prop_encode_c_decode_c_roundtrip
) where
@ -21,7 +22,6 @@ import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii
import Data.Maybe (fromMaybe)
import Data.Word (Word8)
import Data.List (isPrefixOf)
import qualified Codec.Binary.UTF8.String
import qualified Data.Map as M
import qualified Data.ByteString as S
@ -53,7 +53,8 @@ format f vars = concatMap expand f
where
expand (Const s) = s
expand (Var name j esc)
| esc = justify j $ encode_c' isSpace $ getvar name
| esc = justify j $ decodeBS $ encode_c needescape $
encodeBS $ getvar name
| otherwise = justify j $ getvar name
getvar name = fromMaybe "" $ M.lookup name vars
justify UnJustified s = s
@ -61,6 +62,9 @@ format f vars = concatMap expand f
justify (RightJustified i) s = pad i s ++ s
pad i s = take (i - length s) spaces
spaces = repeat ' '
needescape c = isUtf8Byte c ||
isSpace (chr (fromIntegral c)) ||
c == fromIntegral (ord '"')
{- Generates a Format that can be used to expand variables in a
- format string, such as "${foo} ${bar;10} ${baz;-10}\n"
@ -173,42 +177,52 @@ decode_c s
echar 'r' = '\r'
echar 't' = '\t'
echar 'v' = '\v'
echar a = a
echar a = a -- \\ decodes to '\', and \" to '"'
handle' b = (S.empty, b)
{- Inverse of decode_c.
-
- Note that this operates on String, not ByteString, which is important in
- order to be able to handle unicode characters, which get encoded in
- octal. -}
encode_c :: String -> String
encode_c = encode_c' (const False)
{- Inverse of decode_c. Encodes ascii control characters as well as
- bytes that match the predicate. (And also '\' itself.)
-}
encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString
encode_c p s = case encode_c' p s of
Just s' -> s'
Nothing -> s
{- Encodes special characters, as well as any matching the predicate. -}
encode_c' :: (Char -> Bool) -> String -> String
encode_c' p = concatMap echar
{- Returns Nothing when nothing needs to be escaped in the input ByteString. -}
encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString
encode_c' p s
| S.any needencode s = Just (S.concatMap echar s)
| otherwise = Nothing
where
e c = '\\' : [c]
echar '\a' = e 'a'
echar '\b' = e 'b'
echar '\f' = e 'f'
echar '\n' = e 'n'
echar '\r' = e 'r'
echar '\t' = e 't'
echar '\v' = e 'v'
echar '\\' = e '\\'
echar '"' = e '"'
needencode c = iscontrol c || c == del || c == e || p c
e = fromIntegral (ord '\\')
q = fromIntegral (ord '"')
del = 0x7F
iscontrol c = c < 0x20
ec c = S.pack [e, fromIntegral (ord c)]
echar 0x7 = ec 'a'
echar 0x8 = ec 'b'
echar 0x0C = ec 'f'
echar 0x0A = ec 'n'
echar 0x0D = ec 'r'
echar 0x09 = ec 't'
echar 0x0B = ec 'v'
echar c
| ord c < 0x20 = e_asc c -- low ascii
| ord c >= 256 = e_utf c -- unicode
| ord c > 0x7E = e_asc c -- high ascii
| p c = e_asc c
| otherwise = [c]
-- unicode character is decomposed to individual Word8s,
-- and each is shown in octal
e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
e_asc c = showoctal $ ord c
showoctal i = '\\' : printf "%03o" i
| c == e = ec '\\' -- escape the escape character itself
| iscontrol c = showoctal c
| c == del = showoctal c
| p c = if c == q
then ec '"' -- escape double quote
else showoctal c
| otherwise = S.singleton c
showoctal i = encodeBS ('\\' : printf "%03o" i)
isUtf8Byte :: Word8 -> Bool
isUtf8Byte c = c >= 0x80
{- For quickcheck.
-
@ -219,6 +233,7 @@ encode_c' p = concatMap echar
- This property papers over the problem, by only testing ascii.
-}
prop_encode_c_decode_c_roundtrip :: String -> Bool
prop_encode_c_decode_c_roundtrip s = s' == decodeBS (decode_c (encodeBS (encode_c s')))
prop_encode_c_decode_c_roundtrip s = s' ==
decodeBS (decode_c (encode_c isUtf8Byte (encodeBS s')))
where
s' = filter isAscii s