convert encode_c to ByteString

This turns out to be possible after all, because the old one decomposed
a unicode Char to multiple Word8s and encoded those. It should be faster
in some places, particularly in Git.Filename.encodeAlways.

The old version encoded all unicode by default as well as ascii control
characters and also '"'. The new one only encodes ascii control
characters by default.

That old behavior was visible in Utility.Format.format, which did escape
'"' when used in e.g. git-annex find --format='${escaped_file}\n'.
So I made sure to keep that working the same, although the man page only
says it will escape "unusual" characters, so it might be possible to
change that.

Git.Filename.encodeAlways also needs to escape '"'; that was the
original reason it was escaped.

For Types.Transferrer, I judge it is ok to not escape '"', because the
escaped value is sent in a line-based protocol, which is decoded at the
other end by decode_c. So old git-annex and new will be fine whether it
is escaped or not; the result will be the same.

Note that when asked to escape a double quote, it is escaped to \"
rather than to \042. That's the same behavior as git has. It's
perhaps somehow more of a special case than it needs to be.

Sponsored-by: k0ld on Patreon
This commit is contained in:
Joey Hess 2023-04-07 16:47:26 -04:00
parent 371d4f8183
commit d9b6be7782
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 66 additions and 45 deletions

View file

@ -13,6 +13,7 @@ module Utility.Format (
decode_c,
encode_c,
encode_c',
isUtf8Byte,
prop_encode_c_decode_c_roundtrip
) where
@ -21,7 +22,6 @@ import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii
import Data.Maybe (fromMaybe)
import Data.Word (Word8)
import Data.List (isPrefixOf)
import qualified Codec.Binary.UTF8.String
import qualified Data.Map as M
import qualified Data.ByteString as S
@ -53,7 +53,8 @@ format f vars = concatMap expand f
where
expand (Const s) = s
expand (Var name j esc)
| esc = justify j $ encode_c' isSpace $ getvar name
| esc = justify j $ decodeBS $ encode_c needescape $
encodeBS $ getvar name
| otherwise = justify j $ getvar name
getvar name = fromMaybe "" $ M.lookup name vars
justify UnJustified s = s
@ -61,6 +62,9 @@ format f vars = concatMap expand f
justify (RightJustified i) s = pad i s ++ s
pad i s = take (i - length s) spaces
spaces = repeat ' '
needescape c = isUtf8Byte c ||
isSpace (chr (fromIntegral c)) ||
c == fromIntegral (ord '"')
{- Generates a Format that can be used to expand variables in a
- format string, such as "${foo} ${bar;10} ${baz;-10}\n"
@ -173,42 +177,52 @@ decode_c s
echar 'r' = '\r'
echar 't' = '\t'
echar 'v' = '\v'
echar a = a
echar a = a -- \\ decodes to '\', and \" to '"'
handle' b = (S.empty, b)
{- Inverse of decode_c.
-
- Note that this operates on String, not ByteString, which is important in
- order to be able to handle unicode characters, which get encoded in
- octal. -}
encode_c :: String -> String
encode_c = encode_c' (const False)
{- Inverse of decode_c. Escapes ascii control characters (including DEL),
 - the backslash itself, and any byte for which the predicate holds.
 -
 - When nothing in the input needs escaping, the input is returned
 - unchanged.
 -}
encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString
encode_c p s = fromMaybe s (encode_c' p s)
{- Encodes special characters, as well as any matching the predicate. -}
encode_c' :: (Char -> Bool) -> String -> String
encode_c' p = concatMap echar
{- Returns Nothing when nothing needs to be escaped in the input ByteString. -}
encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString
encode_c' p s
| S.any needencode s = Just (S.concatMap echar s)
| otherwise = Nothing
where
e c = '\\' : [c]
echar '\a' = e 'a'
echar '\b' = e 'b'
echar '\f' = e 'f'
echar '\n' = e 'n'
echar '\r' = e 'r'
echar '\t' = e 't'
echar '\v' = e 'v'
echar '\\' = e '\\'
echar '"' = e '"'
needencode c = iscontrol c || c == del || c == e || p c
e = fromIntegral (ord '\\')
q = fromIntegral (ord '"')
del = 0x7F
iscontrol c = c < 0x20
ec c = S.pack [e, fromIntegral (ord c)]
echar 0x7 = ec 'a'
echar 0x8 = ec 'b'
echar 0x0C = ec 'f'
echar 0x0A = ec 'n'
echar 0x0D = ec 'r'
echar 0x09 = ec 't'
echar 0x0B = ec 'v'
echar c
| ord c < 0x20 = e_asc c -- low ascii
| ord c >= 256 = e_utf c -- unicode
| ord c > 0x7E = e_asc c -- high ascii
| p c = e_asc c
| otherwise = [c]
-- unicode character is decomposed to individual Word8s,
-- and each is shown in octal
e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
e_asc c = showoctal $ ord c
showoctal i = '\\' : printf "%03o" i
| c == e = ec '\\' -- escape the escape character itself
| iscontrol c = showoctal c
| c == del = showoctal c
| p c = if c == q
then ec '"' -- escape double quote
else showoctal c
| otherwise = S.singleton c
showoctal i = encodeBS ('\\' : printf "%03o" i)
{- Matches any byte with the high bit set, ie any byte that is outside
 - the 7-bit ascii range. -}
isUtf8Byte :: Word8 -> Bool
isUtf8Byte = (>= 0x80)
{- For quickcheck.
-
@ -219,6 +233,7 @@ encode_c' p = concatMap echar
- This property papers over the problem, by only testing ascii.
-}
prop_encode_c_decode_c_roundtrip :: String -> Bool
prop_encode_c_decode_c_roundtrip s = s' == decodeBS (decode_c (encodeBS (encode_c s')))
prop_encode_c_decode_c_roundtrip s = s' ==
decodeBS (decode_c (encode_c isUtf8Byte (encodeBS s')))
where
s' = filter isAscii s