convert encode_c to ByteString

This turns out to be possible after all, because the old one decomposed a unicode Char to multiple Word8s and encoded those. It should be faster in some places, particularly in Git.Filename.encodeAlways. The old version encoded all unicode by default as well as ascii control characters and also '"'. The new one only encodes ascii control characters by default. That old behavior was visible in Utility.Format.format, which did escape '"' when used in eg git-annex find --format='${escaped_file}\n' So made sure to keep that working the same. Although the man page only says it will escape "unusual" characters, so it might be able to be changed. Git.Filename.encodeAlways also needs to escape '"' ; that was the original reason that was escaped. Types.Transferrer I judge is ok to not escape '"', because the escaped value is sent in a line-based protocol, which is decoded at the other end by decode_c. So old git-annex and new will be fine whether that is escaped or not, the result will be the same. Note that when asked to escape a double quote, it is escaped to \" rather than to \042. That's the same behavior as git has. It's perhaps somehow more of a special case than it needs to be. Sponsored-by: k0ld on Patreon
2023-04-07 16:47:26 -04:00 · 2023-04-07 16:47:26 -04:00 · d9b6be7782
commit d9b6be7782
parent 371d4f8183
3 changed files with 66 additions and 45 deletions
--- a/Git/Filename.hs
+++ b/Git/Filename.hs
@ -1,15 +1,17 @@
 {- Some git commands output encoded filenames, in a rather annoyingly complex
 - C-style encoding.
 -
- - Copyright 2010, 2011 Joey Hess <id@joeyh.name>
+ - Copyright 2010-2023 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}

+{-# LANGUAGE OverloadedStrings #-}
+
 module Git.Filename where

 import Common
-import Utility.Format (decode_c, encode_c)
+import Utility.Format (decode_c, encode_c, isUtf8Byte)
 import Utility.QuickCheck

 import Data.Char
@ -31,9 +33,11 @@ decode b = case S.uncons b of
  	q :: Word8
 	q = fromIntegral (ord '"')

-{- Should not need to use this, except for testing decode. -}
-encode :: RawFilePath -> S.ByteString
-encode s = encodeBS $ "\"" ++ encode_c (decodeBS s) ++ "\""
+-- always encodes and double quotes, even in cases that git does not
+encodeAlways :: RawFilePath -> S.ByteString
+encodeAlways s = "\"" <> encode_c needencode s <> "\""
+  where
+	needencode c = isUtf8Byte c || c == fromIntegral (ord '"')

 -- Encoding and then decoding roundtrips only when the string does not
 -- contain high unicode, because eg,  both "\12345" and "\227\128\185"
@ -43,6 +47,6 @@ encode s = encodeBS $ "\"" ++ encode_c (decodeBS s) ++ "\""
 -- limits what's tested to ascii, so avoids running into it.
 prop_encode_decode_roundtrip :: TestableFilePath -> Bool
 prop_encode_decode_roundtrip ts = 
-	s == fromRawFilePath (decode (encode (toRawFilePath s)))
+	s == fromRawFilePath (decode (encodeAlways (toRawFilePath s)))
  where
 	s = fromTestableFilePath ts
--- a/Types/Transferrer.hs
+++ b/Types/Transferrer.hs
@ -83,9 +83,9 @@ instance Proto.Receivable TransferRequest where

 instance Proto.Sendable TransferResponse where
 	formatMessage (TransferOutput (OutputMessage m)) =
-		["om", Proto.serialize (encode_c (decodeBS m))]
+		["om", Proto.serialize (decodeBS (encode_c isUtf8Byte m))]
 	formatMessage (TransferOutput (OutputError e)) =
-		["oe", Proto.serialize (encode_c e)]
+		["oe", Proto.serialize (decodeBS (encode_c isUtf8Byte (encodeBS e)))]
 	formatMessage (TransferOutput BeginProgressMeter) =
 		["opb"]
 	formatMessage (TransferOutput (UpdateProgressMeterTotalSize (TotalSize sz))) =
@ -99,7 +99,7 @@ instance Proto.Sendable TransferResponse where
 	formatMessage (TransferOutput EndPrompt) =
 		["opre"]
 	formatMessage (TransferOutput (JSONObject b)) =
-		["oj", Proto.serialize (encode_c (decodeBL b))]
+		["oj", Proto.serialize (decodeBS (encode_c isUtf8Byte (L.toStrict b)))]
 	formatMessage (TransferResult True) =
 		["t"]
 	formatMessage (TransferResult False) =
@ -141,7 +141,9 @@ instance Proto.Serializable TransferRemote where
 	serialize (TransferRemoteUUID u) = 'u':fromUUID u
 	-- A remote name could contain whitespace or newlines, which needs
 	-- to be escaped for the protocol. Use C-style encoding.
-	serialize (TransferRemoteName r) = 'r':encode_c' isSpace r
+	serialize (TransferRemoteName r) = 'r':decodeBS (encode_c is_space_or_unicode (encodeBS r))
+	  where
+		is_space_or_unicode c = isUtf8Byte c || isSpace (chr (fromIntegral c))

 	deserialize ('u':u) = Just (TransferRemoteUUID (toUUID u))
 	deserialize ('r':r) = Just (TransferRemoteName (decodeBS (decode_c (encodeBS r))))
@ -151,7 +153,7 @@ instance Proto.Serializable TransferAssociatedFile where
 	-- Comes last, so whitespace is ok. But, in case the filename
 	-- contains eg a newline, escape it. Use C-style encoding.
 	serialize (TransferAssociatedFile (AssociatedFile (Just f))) =
-		encode_c (fromRawFilePath f)
+		decodeBS (encode_c isUtf8Byte f)
 	serialize (TransferAssociatedFile (AssociatedFile Nothing)) = ""

 	deserialize "" = Just $ TransferAssociatedFile $
--- a/Utility/Format.hs
+++ b/Utility/Format.hs
@ -13,6 +13,7 @@ module Utility.Format (
 	decode_c,
 	encode_c,
 	encode_c',
+	isUtf8Byte,
 	prop_encode_c_decode_c_roundtrip
 ) where

@ -21,7 +22,6 @@ import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii
 import Data.Maybe (fromMaybe)
 import Data.Word (Word8)
 import Data.List (isPrefixOf)
-import qualified Codec.Binary.UTF8.String
 import qualified Data.Map as M
 import qualified Data.ByteString as S

@ -53,7 +53,8 @@ format f vars = concatMap expand f
  where
 	expand (Const s) = s
 	expand (Var name j esc)
-		| esc = justify j $ encode_c' isSpace $ getvar name
+		| esc = justify j $ decodeBS $ encode_c needescape $
+			encodeBS $ getvar name
 		| otherwise = justify j $ getvar name
 	getvar name = fromMaybe "" $ M.lookup name vars
 	justify UnJustified s        = s
@ -61,6 +62,9 @@ format f vars = concatMap expand f
 	justify (RightJustified i) s = pad i s ++ s
 	pad i s = take (i - length s) spaces
 	spaces = repeat ' '
+	needescape c = isUtf8Byte c ||
+		isSpace (chr (fromIntegral c)) ||
+		c == fromIntegral (ord '"')

 {- Generates a Format that can be used to expand variables in a
 - format string, such as "${foo} ${bar;10} ${baz;-10}\n"
@ -173,42 +177,52 @@ decode_c s
 		echar 'r' = '\r'
 		echar 't' = '\t'
 		echar 'v' = '\v'
-		echar a = a
+		echar a = a -- \\ decodes to '\', and \" to '"'
 	handle' b = (S.empty, b)

-{- Inverse of decode_c. 
- -
- - Note that this operates on String, not ByteString, which is important in
- - order to be able to handle unicode characters, which get encoded in
- - octal. -}
-encode_c :: String -> String
-encode_c = encode_c' (const False)
+{- Inverse of decode_c. Encodes ascii control characters as well as
+ - bytes that match the predicate. (And also '\' itself.)
+ -}
+encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString
+encode_c p s = case encode_c' p s of
+	Just s' -> s'
+	Nothing -> s

-{- Encodes special characters, as well as any matching the predicate. -}
-encode_c' :: (Char -> Bool) -> String -> String
-encode_c' p = concatMap echar
+{- Returns Nothing when nothing needs to be escaped in the input ByteString. -}
+encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString
+encode_c' p s
+	| S.any needencode s = Just (S.concatMap echar s)
+	| otherwise = Nothing
  where
-	e c = '\\' : [c]
-	echar '\a' = e 'a'
-	echar '\b' = e 'b'
-	echar '\f' = e 'f'
-	echar '\n' = e 'n'
-	echar '\r' = e 'r'
-	echar '\t' = e 't'
-	echar '\v' = e 'v'
-	echar '\\' = e '\\'
-	echar '"'  = e '"'
+	needencode c = iscontrol c || c == del || c == e || p c
+
+	e = fromIntegral (ord '\\')
+	q = fromIntegral (ord '"')
+	del = 0x7F
+	iscontrol c = c < 0x20
+
+	ec c = S.pack [e, fromIntegral (ord c)]
+	
+	echar 0x7 = ec 'a'
+	echar 0x8 = ec 'b'
+	echar 0x0C = ec 'f'
+	echar 0x0A = ec 'n'
+	echar 0x0D = ec 'r'
+	echar 0x09 = ec 't'
+	echar 0x0B = ec 'v'
 	echar c
-		| ord c < 0x20 = e_asc c -- low ascii
-		| ord c >= 256 = e_utf c -- unicode
-		| ord c > 0x7E = e_asc c -- high ascii
-		| p c          = e_asc c
-		| otherwise    = [c]
-	-- unicode character is decomposed to individual Word8s,
-	-- and each is shown in octal
-	e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
-	e_asc c = showoctal $ ord c
-	showoctal i = '\\' : printf "%03o" i
+		| c == e = ec '\\' -- escape the escape character itself
+		| iscontrol c = showoctal c 
+		| c == del = showoctal c
+		| p c = if c == q
+			then ec '"' -- escape double quote
+			else showoctal c
+		| otherwise = S.singleton c
+
+	showoctal i = encodeBS ('\\' : printf "%03o" i)
+
+isUtf8Byte :: Word8 -> Bool
+isUtf8Byte c = c >= 0x80

 {- For quickcheck. 
 -
@ -219,6 +233,7 @@ encode_c' p = concatMap echar
 - This property papers over the problem, by only testing ascii.
 -}
 prop_encode_c_decode_c_roundtrip :: String -> Bool
-prop_encode_c_decode_c_roundtrip s = s' == decodeBS (decode_c (encodeBS (encode_c s')))
+prop_encode_c_decode_c_roundtrip s = s' == 
+	decodeBS (decode_c (encode_c isUtf8Byte (encodeBS s')))
  where
 	s' = filter isAscii s