decode_c converted to ByteString

This speeds up a few things, notably CmdLine.Seek using Git.Filename
which uses decode_c and this avoids a conversion to String and back,
and probably the ByteString implementation of decode_c is also faster
for simple cases at least than the string version.

encode_c cannot be converted to ByteString (or if it did, it would have
to convert right back to String in order to handle unicode).

Sponsored-by: Brock Spratlen on Patreon
This commit is contained in:
Joey Hess 2023-04-07 14:44:19 -04:00
parent f0b1034f8f
commit 371d4f8183
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 57 additions and 36 deletions

View file

@ -26,8 +26,7 @@ decode b = case S.uncons b of
Nothing -> b
Just (i, l)
| l /= q -> b
| otherwise ->
encodeBS $ decode_c $ decodeBS i
| otherwise -> decode_c i
where
q :: Word8
q = fromIntegral (ord '"')

View file

@ -15,6 +15,7 @@ import Utility.Format
import Utility.Metered (TotalSize(..))
import Data.Char
import qualified Data.ByteString.Lazy as L
-- Sent to start a transfer.
data TransferRequest
@ -106,9 +107,9 @@ instance Proto.Sendable TransferResponse where
instance Proto.Receivable TransferResponse where
parseCommand "om" = Proto.parse1 $
TransferOutput . OutputMessage . encodeBS . decode_c
TransferOutput . OutputMessage . decode_c . encodeBS
parseCommand "oe" = Proto.parse1 $
TransferOutput . OutputError . decode_c
TransferOutput . OutputError . decodeBS . decode_c . encodeBS
parseCommand "opb" = Proto.parse0 $
TransferOutput BeginProgressMeter
parseCommand "ops" = Proto.parse1 $
@ -122,7 +123,7 @@ instance Proto.Receivable TransferResponse where
parseCommand "opre" = Proto.parse0 $
TransferOutput EndPrompt
parseCommand "oj" = Proto.parse1 $
TransferOutput . JSONObject . encodeBL . decode_c
TransferOutput . JSONObject . L.fromStrict . decode_c . encodeBS
parseCommand "t" = Proto.parse0 $
TransferResult True
parseCommand "f" = Proto.parse0 $
@ -143,7 +144,7 @@ instance Proto.Serializable TransferRemote where
serialize (TransferRemoteName r) = 'r':encode_c' isSpace r
deserialize ('u':u) = Just (TransferRemoteUUID (toUUID u))
deserialize ('r':r) = Just (TransferRemoteName (decode_c r))
deserialize ('r':r) = Just (TransferRemoteName (decodeBS (decode_c (encodeBS r))))
deserialize _ = Nothing
instance Proto.Serializable TransferAssociatedFile where
@ -156,4 +157,4 @@ instance Proto.Serializable TransferAssociatedFile where
deserialize "" = Just $ TransferAssociatedFile $
AssociatedFile Nothing
deserialize s = Just $ TransferAssociatedFile $
AssociatedFile $ Just $ toRawFilePath $ decode_c s
AssociatedFile $ Just $ decode_c $ encodeBS s

View file

@ -1,6 +1,6 @@
{- Formatted string handling.
-
- Copyright 2010-2020 Joey Hess <id@joeyh.name>
- Copyright 2010-2023 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
@ -23,10 +23,10 @@ import Data.Word (Word8)
import Data.List (isPrefixOf)
import qualified Codec.Binary.UTF8.String
import qualified Data.Map as M
import qualified Data.ByteString as S
import Utility.PartialPrelude
type FormatString = String
import Utility.FileSystemEncoding
{- A format consists of a list of fragments. -}
type Format = [Frag]
@ -69,8 +69,8 @@ format f vars = concatMap expand f
-
- Also, "${escaped_foo}" will apply encode_c to the value of variable foo.
-}
gen :: FormatString -> Format
gen = filter (not . empty) . fuse [] . scan [] . decode_c
gen :: String -> Format
gen = filter (not . empty) . fuse [] . scan [] . decodeBS . decode_c . encodeBS
where
-- The Format is built up in reverse, for efficiency,
-- and can have many adjacent Consts. Fusing it fixes both
@ -122,33 +122,50 @@ formatContainsVar v = any go
{- Decodes a C-style encoding, where \n is a newline (etc),
- \NNN is an octal encoded character, and \xNN is a hex encoded character.
-}
decode_c :: FormatString -> String
decode_c [] = []
decode_c s = unescape ("", s)
decode_c :: S.ByteString -> S.ByteString
decode_c s
| S.null s = S.empty
| otherwise = unescape (S.empty, s)
where
e = '\\'
unescape (b, []) = b
-- look for escapes starting with '\'
unescape (b, v) = b ++ fst pair ++ unescape (handle $ snd pair)
e = fromIntegral (ord '\\')
x = fromIntegral (ord 'x')
isescape c = c == e
unescape (b, v)
| S.null v = b
| otherwise = b <> fst pair <> unescape (handle $ snd pair)
where
pair = span (/= e) v
isescape x = x == e
handle (x:'x':n1:n2:rest)
| isescape x && allhex = (fromhex, rest)
pair = S.span (not . isescape) v
handle b
| S.length b >= 1 && isescape (S.index b 0) = handle' b
| otherwise = (S.empty, b)
handle' b
| S.length b >= 4
&& S.index b 1 == x
&& allhex = (fromhex, rest)
where
n1 = chr (fromIntegral (S.index b 2))
n2 = chr (fromIntegral (S.index b 3))
rest = S.drop 4 b
allhex = isHexDigit n1 && isHexDigit n2
fromhex = [chr $ readhex [n1, n2]]
fromhex = encodeBS [chr $ readhex [n1, n2]]
readhex h = Prelude.read $ "0x" ++ h :: Int
handle (x:n1:n2:n3:rest)
| isescape x && alloctal = (fromoctal, rest)
handle' b
| S.length b >= 4 && alloctal = (fromoctal, rest)
where
n1 = chr (fromIntegral (S.index b 1))
n2 = chr (fromIntegral (S.index b 2))
n3 = chr (fromIntegral (S.index b 3))
rest = S.drop 4 b
alloctal = isOctDigit n1 && isOctDigit n2 && isOctDigit n3
fromoctal = [chr $ readoctal [n1, n2, n3]]
fromoctal = encodeBS [chr $ readoctal [n1, n2, n3]]
readoctal o = Prelude.read $ "0o" ++ o :: Int
-- \C is used for a few special characters
handle (x:nc:rest)
| isescape x = ([echar nc], rest)
handle' b
| S.length b >= 2 =
(S.singleton (fromIntegral (ord (echar nc))), rest)
where
nc = chr (fromIntegral (S.index b 1))
rest = S.drop 2 b
echar 'a' = '\a'
echar 'b' = '\b'
echar 'f' = '\f'
@ -157,14 +174,18 @@ decode_c s = unescape ("", s)
echar 't' = '\t'
echar 'v' = '\v'
echar a = a
handle n = ("", n)
handle' b = (S.empty, b)
{- Inverse of decode_c. -}
encode_c :: String -> FormatString
{- Inverse of decode_c.
-
- Note that this operates on String, not ByteString, which is important in
- order to be able to handle unicode characters, which get encoded in
- octal. -}
encode_c :: String -> String
encode_c = encode_c' (const False)
{- Encodes special characters, as well as any matching the predicate. -}
encode_c' :: (Char -> Bool) -> String -> FormatString
encode_c' :: (Char -> Bool) -> String -> String
encode_c' p = concatMap echar
where
e c = '\\' : [c]
@ -198,6 +219,6 @@ encode_c' p = concatMap echar
- This property papers over the problem, by only testing ascii.
-}
prop_encode_c_decode_c_roundtrip :: String -> Bool
prop_encode_c_decode_c_roundtrip s = s' == decode_c (encode_c s')
prop_encode_c_decode_c_roundtrip s = s' == decodeBS (decode_c (encodeBS (encode_c s')))
where
s' = filter isAscii s

View file

@ -248,7 +248,7 @@ secretKeys cmd = catchDefaultIO M.empty makemap
-- If the userid contains a ":" or a few other special
-- characters, gpg will hex-escape it. Use decode_c to
-- undo.
extract ((keyid, decode_c userid):c) Nothing rest
extract ((keyid, decodeBS (decode_c (encodeBS userid))):c) Nothing rest
extract c (Just keyid) rest@(("sec":_):_) =
extract ((keyid, ""):c) Nothing rest
extract c (Just keyid) (_:rest) =