2011-12-22 21:59:14 +00:00
|
|
|
{- Formatted string handling.
|
|
|
|
-
|
2023-04-07 18:44:19 +00:00
|
|
|
- Copyright 2010-2023 Joey Hess <id@joeyh.name>
|
2011-12-22 21:59:14 +00:00
|
|
|
-
|
2014-05-10 14:01:27 +00:00
|
|
|
- License: BSD-2-clause
|
2011-12-22 21:59:14 +00:00
|
|
|
-}
|
|
|
|
|
2011-12-23 00:14:35 +00:00
|
|
|
module Utility.Format (
|
|
|
|
Format,
|
|
|
|
gen,
|
|
|
|
format,
|
2023-04-11 18:57:09 +00:00
|
|
|
escapedFormat,
|
2020-05-19 19:35:00 +00:00
|
|
|
formatContainsVar,
|
2011-12-23 00:14:35 +00:00
|
|
|
decode_c,
|
|
|
|
encode_c,
|
2020-12-09 19:28:45 +00:00
|
|
|
encode_c',
|
2023-04-07 20:47:26 +00:00
|
|
|
isUtf8Byte,
|
fix failing quickcheck properties
QuickCheck 2.10 found a counterexample eg "\929184" broke the property.
As far as I can tell, Git.Filename is matching how git handles encoding
of strange high unicode characters in filenames for display. Git does
not display high unicode characters, and instead displays the C-style
escaped form of each byte. This is ambiguous, but since git is not
unicode aware, it doesn't need to roundtrip parse it.
So, making Git.FileName's roundtrip test only chars < 256 seems fine.
Utility.Format.format uses encode_c, in order to mimic git, so that's
ok.
Utility.Format.gen uses decode_c, but only so that stuff like "\n"
in the format string is handled. If the format string contains C-style
octal escapes, they will be converted to ascii characters, and not
combined into unicode characters, but that should not be a problem.
If the user wants unicode characters, they can include them in the
format string, without escaping them.
Finally, decode_c is used by Utility.Gpg.secretKeys, because gpg
--with-colons hex-escapes some characters in particular ':' and '\\'.
gpg passes unicode through, so this use of decode_c is not a problem.
This commit was sponsored by Henrik Riomar on Patreon.
2017-06-17 20:17:09 +00:00
|
|
|
prop_encode_c_decode_c_roundtrip
|
2011-12-23 00:14:35 +00:00
|
|
|
) where
|
2011-12-22 21:59:14 +00:00
|
|
|
|
|
|
|
import Text.Printf (printf)
|
2019-12-30 17:54:46 +00:00
|
|
|
import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii)
|
2011-12-23 00:14:35 +00:00
|
|
|
import Data.Maybe (fromMaybe)
|
|
|
|
import Data.Word (Word8)
|
2011-12-23 01:23:11 +00:00
|
|
|
import Data.List (isPrefixOf)
|
2011-12-22 21:59:14 +00:00
|
|
|
import qualified Data.Map as M
|
2023-04-07 18:44:19 +00:00
|
|
|
import qualified Data.ByteString as S
|
2011-12-22 21:59:14 +00:00
|
|
|
|
|
|
|
import Utility.PartialPrelude
|
2023-04-07 18:44:19 +00:00
|
|
|
import Utility.FileSystemEncoding
|
2011-12-22 21:59:14 +00:00
|
|
|
|
2011-12-22 23:56:31 +00:00
|
|
|
{- A format consists of a list of fragments. -}
|
|
|
|
{- Fragments are kept in output order; format expands each one in turn
 - and concatenates the results. -}
type Format = [Frag]
|
2011-12-22 21:59:14 +00:00
|
|
|
|
2020-05-19 19:35:00 +00:00
|
|
|
{- One piece of a Format: either literal text that is output as-is,
 - or a reference to a variable that is expanded when formatting. -}
data Frag
    = Const String
    | Var
        { varName :: String
        -- ^ name looked up in the Variables
        , varJustify :: Justify
        -- ^ padding applied to the expanded value
        , varEscaped :: Bool
        -- ^ when True, the value is C-style escaped before padding
        }
    deriving (Show)
|
|
|
|
|
2011-12-23 04:36:25 +00:00
|
|
|
{- How an expanded variable is padded with spaces up to a minimum width.
 - LeftJustified puts the padding after the value, RightJustified puts
 - it before the value, and UnJustified adds no padding. -}
data Justify
    = LeftJustified Int
    | RightJustified Int
    | UnJustified
    deriving (Show)
|
2011-12-22 23:56:31 +00:00
|
|
|
|
2012-01-21 06:24:12 +00:00
|
|
|
{- Maps variable names to the values that format substitutes for them.
 - A name missing from the map expands to the empty string. -}
type Variables = M.Map String String
|
|
|
|
|
2011-12-22 21:59:14 +00:00
|
|
|
{- Expands a Format using some variables, generating a formatted string.
 - This can be repeatedly called, efficiently.
 -
 - A variable that is not in the map expands to the empty string.
 - Escaping (when requested) happens before justification, so padding
 - is computed from the length of the escaped value. -}
format :: Format -> Variables -> String
format fmt vars = foldMap render fmt
  where
    render frag = case frag of
        Const lit -> lit
        Var name just escaped
            | escaped -> align just (escape (valueOf name))
            | otherwise -> align just (valueOf name)

    valueOf name = M.findWithDefault "" name vars

    escape v = decodeBS (escapedFormat (encodeBS v))

    align just val = case just of
        UnJustified -> val
        LeftJustified width -> val ++ fill width
        RightJustified width -> fill width ++ val
      where
        -- A value already at least as wide as requested gets no padding.
        fill width = replicate (width - length val) ' '
|
2023-04-11 18:57:09 +00:00
|
|
|
|
|
|
|
{- C-style escapes a value for use in an escaped variable expansion.
 - On top of what encode_c always escapes, this also escapes bytes
 - with the high bit set, whitespace, and double quotes. -}
escapedFormat :: S.ByteString -> S.ByteString
escapedFormat = encode_c $ \b -> or
    [ isUtf8Byte b
    , isSpace (chr (fromIntegral b))
    , b == dquote
    ]
  where
    dquote = fromIntegral (ord '"')
|
2011-12-22 21:59:14 +00:00
|
|
|
|
|
|
|
{- Generates a Format that can be used to expand variables in a
 - format string, such as "${foo} ${bar;10} ${baz;-10}\n"
 -
 - (This is the same type of format string used by dpkg-query.)
 -
 - The number after the ';' is a minimum width: a positive number
 - right-justifies the value, a negative number left-justifies it.
 -
 - Also, "${escaped_foo}" expands to the C-style escaped value of
 - variable foo.
 -
 - C-style escapes in the format string itself are decoded before
 - it is parsed. Anything that starts like a variable but is not well
 - formed is kept as literal text.
 -}
gen :: String -> Format
gen input = filter (not . empty) (fuse [] (scan [] unescaped))
  where
    unescaped = decodeBS (decode_c (encodeBS input))

    -- The scanner builds the Format in reverse, for efficiency, and can
    -- emit many adjacent Consts. Fusing restores the order and merges
    -- neighbouring Consts.
    fuse done [] = done
    fuse done (Const later : Const earlier : more) =
        fuse done (Const (earlier ++ later) : more)
    fuse done (frag : more) = fuse (frag : done) more

    -- Outside of any variable: copy characters until "${" is seen.
    scan acc ('$' : '{' : rest) = invar acc [] rest
    scan acc (c : rest@(_ : _)) = scan (Const [c] : acc) rest
    scan acc leftover = Const leftover : acc

    -- Inside "${", collecting the (reversed) variable name.
    invar acc name_r remaining = case remaining of
        [] -> Const (novar name_r) : acc
        (c : rest)
            | c == '}' -> foundvar acc name_r UnJustified rest
            | isAlphaNum c || c == '_' -> invar acc (c : name_r) rest
            | c == ';' -> inpad "" acc name_r rest
            | otherwise -> scan (Const (novar (c : name_r)) : acc) rest

    -- After the ';', collecting the (reversed) width specification.
    inpad pad_r acc name_r remaining = case remaining of
        [] -> Const (novar (pad_r ++ ";" ++ name_r)) : acc
        ('}' : rest) ->
            foundvar acc name_r (readjustify (reverse pad_r)) rest
        (c : rest) -> inpad (c : pad_r) acc name_r rest

    -- A width that does not parse is treated as 0, i.e. no justification.
    readjustify padspec = case fromMaybe 0 (readish padspec) of
        0 -> UnJustified
        width
            | width < 0 -> LeftJustified (negate width)
            | otherwise -> RightJustified width

    -- Literal text for something that turned out not to be a variable.
    novar chars_r = "${" ++ reverse chars_r

    foundvar acc name_r just = scan (var : acc)
      where
        name = reverse name_r
        prefix = "escaped_"
        var
            | prefix `isPrefixOf` name =
                Var (drop (length prefix) name) just True
            | otherwise = Var name just False
|
2011-12-23 00:14:35 +00:00
|
|
|
|
2011-12-23 00:21:42 +00:00
|
|
|
{- True only for a constant fragment that holds no text. -}
empty :: Frag -> Bool
empty frag = case frag of
    Const s -> null s
    _ -> False
|
2011-12-23 00:14:35 +00:00
|
|
|
|
2020-05-19 19:35:00 +00:00
|
|
|
{- Check if a Format contains a variable with a specified name.
 -
 - Only the variable's name is compared; its justification and whether
 - it is escaped do not matter. -}
formatContainsVar :: String -> Format -> Bool
formatContainsVar wanted fmt = wanted `elem` [name | Var name _ _ <- fmt]
|
|
|
|
|
fix failing quickcheck properties
QuickCheck 2.10 found a counterexample eg "\929184" broke the property.
As far as I can tell, Git.Filename is matching how git handles encoding
of strange high unicode characters in filenames for display. Git does
not display high unicode characters, and instead displays the C-style
escaped form of each byte. This is ambiguous, but since git is not
unicode aware, it doesn't need to roundtrip parse it.
So, making Git.FileName's roundtrip test only chars < 256 seems fine.
Utility.Format.format uses encode_c, in order to mimic git, so that's
ok.
Utility.Format.gen uses decode_c, but only so that stuff like "\n"
in the format string is handled. If the format string contains C-style
octal escapes, they will be converted to ascii characters, and not
combined into unicode characters, but that should not be a problem.
If the user wants unicode characters, they can include them in the
format string, without escaping them.
Finally, decode_c is used by Utility.Gpg.secretKeys, because gpg
--with-colons hex-escapes some characters in particular ':' and '\\'.
gpg passes unicode through, so this use of decode_c is not a problem.
This commit was sponsored by Henrik Riomar on Patreon.
2017-06-17 20:17:09 +00:00
|
|
|
{- Decodes a C-style encoding, where \n is a newline (etc),
 - \NNN is an octal encoded character, and \xNN is a hex encoded character.
 -
 - A backslash followed by any other byte decodes to that byte, so
 - \\ decodes to '\' and \" to '"'.
 -
 - A lone backslash at the very end of the input has nothing to escape,
 - and is passed through unchanged.
 -}
decode_c :: S.ByteString -> S.ByteString
decode_c s
    | S.null s = S.empty
    | otherwise = unescape (S.empty, s)
  where
    e = fromIntegral (ord '\\')
    x = fromIntegral (ord 'x')
    isescape c = c == e

    -- b is output decoded from the escape just handled;
    -- v is the input that still has to be processed.
    unescape (b, v)
        | S.null v = b
        | otherwise = b <> fst pair <> unescape (handle $ snd pair)
      where
        -- Everything up to the next backslash is copied verbatim.
        pair = S.span (not . isescape) v

    -- Returns the decoded bytes of one escape sequence, and the input
    -- remaining after it.
    handle b
        | S.length b >= 1 && isescape (S.index b 0) = handle' b
        | otherwise = (S.empty, b)

    -- \xNN
    handle' b
        | S.length b >= 4
            && S.index b 1 == x
            && allhex = (fromhex, rest)
      where
        n1 = chr (fromIntegral (S.index b 2))
        n2 = chr (fromIntegral (S.index b 3))
        rest = S.drop 4 b
        allhex = isHexDigit n1 && isHexDigit n2
        fromhex = encodeBS [chr $ readhex [n1, n2]]
        -- read cannot fail: only used once allhex has been checked.
        readhex h = Prelude.read $ "0x" ++ h :: Int
    -- \NNN
    handle' b
        | S.length b >= 4 && alloctal = (fromoctal, rest)
      where
        n1 = chr (fromIntegral (S.index b 1))
        n2 = chr (fromIntegral (S.index b 2))
        n3 = chr (fromIntegral (S.index b 3))
        rest = S.drop 4 b
        alloctal = isOctDigit n1 && isOctDigit n2 && isOctDigit n3
        fromoctal = encodeBS [chr $ readoctal [n1, n2, n3]]
        -- read cannot fail: only used once alloctal has been checked.
        readoctal o = Prelude.read $ "0o" ++ o :: Int
    -- Backslash followed by a single character.
    handle' b
        | S.length b >= 2 =
            (S.singleton (fromIntegral (ord (echar nc))), rest)
      where
        nc = chr (fromIntegral (S.index b 1))
        rest = S.drop 2 b
        echar 'a' = '\a'
        echar 'b' = '\b'
        echar 'f' = '\f'
        echar 'n' = '\n'
        echar 'r' = '\r'
        echar 't' = '\t'
        echar 'v' = '\v'
        echar a = a -- \\ decodes to '\', and \" to '"'
    -- Only reached when b is a single backslash at the end of the input.
    -- It has to be consumed here: handing it back as remaining input
    -- would make unescape process the same backslash again, forever.
    handle' b = (b, S.empty)
|
2011-12-23 00:14:35 +00:00
|
|
|
|
2023-04-07 20:47:26 +00:00
|
|
|
{- Inverse of decode_c. Encodes ascii control characters as well as
 - bytes that match the predicate. (And also '\' itself.)
 -
 - When nothing needs to be escaped, the input is returned as-is.
 -}
encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString
encode_c p s = case encode_c' p s of
    Just escaped -> escaped
    Nothing -> s
|
2023-04-07 20:47:26 +00:00
|
|
|
|
|
|
|
{- Like encode_c, but returns Nothing when nothing needs to be escaped
 - in the input ByteString.
 -
 - Bytes below 0x20, DEL (0x7F) and '\' are always escaped; any other
 - byte is escaped only when the predicate matches it.
 -}
encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString
encode_c' p s =
    if S.any mustEscape s
        then Just (S.concatMap escapeByte s)
        else Nothing
  where
    backslash = fromIntegral (ord '\\')
    dquote = fromIntegral (ord '"')

    isControl b = b < 0x20

    mustEscape b = or [isControl b, b == backslash, b == 0x7F, p b]

    escapeByte b = case b of
        -- control characters that have a conventional short escape
        0x07 -> named 'a'
        0x08 -> named 'b'
        0x0C -> named 'f'
        0x0A -> named 'n'
        0x0D -> named 'r'
        0x09 -> named 't'
        0x0B -> named 'v'
        _
            -- remaining control characters, and DEL
            | isControl b || b == 0x7F -> octal b
            -- escape the escape character itself
            | b == backslash -> named '\\'
            -- a double quote gets \" rather than an octal escape
            | p b -> if b == dquote then named '"' else octal b
            | otherwise -> S.singleton b

    -- Backslash followed by the given character.
    named c = S.pack [backslash, fromIntegral (ord c)]

    -- Backslash followed by three octal digits.
    octal b = encodeBS ('\\' : printf "%03o" b)
|
|
|
|
|
|
|
|
{- True for any byte with the high bit set, which is to say any byte
 - outside the 7 bit ascii range. -}
isUtf8Byte :: Word8 -> Bool
isUtf8Byte = (> 0x7F)
|
2011-12-23 00:14:35 +00:00
|
|
|
|
fix failing quickcheck properties
QuickCheck 2.10 found a counterexample eg "\929184" broke the property.
As far as I can tell, Git.Filename is matching how git handles encoding
of strange high unicode characters in filenames for display. Git does
not display high unicode characters, and instead displays the C-style
escaped form of each byte. This is ambiguous, but since git is not
unicode aware, it doesn't need to roundtrip parse it.
So, making Git.FileName's roundtrip test only chars < 256 seems fine.
Utility.Format.format uses encode_c, in order to mimic git, so that's
ok.
Utility.Format.gen uses decode_c, but only so that stuff like "\n"
in the format string is handled. If the format string contains C-style
octal escapes, they will be converted to ascii characters, and not
combined into unicode characters, but that should not be a problem.
If the user wants unicode characters, they can include them in the
format string, without escaping them.
Finally, decode_c is used by Utility.Gpg.secretKeys, because gpg
--with-colons hex-escapes some characters in particular ':' and '\\'.
gpg passes unicode through, so this use of decode_c is not a problem.
This commit was sponsored by Henrik Riomar on Patreon.
2017-06-17 20:17:09 +00:00
|
|
|
{- For quickcheck.
 -
 - Encoding and then decoding roundtrips only when
 - the string is ascii because eg, both "\12345" and
 - "\227\128\185" are encoded to "\343\200\271".
 -
 - This property papers over the problem, by only testing ascii:
 - non-ascii characters are dropped from the generated string first.
 -}
prop_encode_c_decode_c_roundtrip :: String -> Bool
prop_encode_c_decode_c_roundtrip s = roundtrip ascii == ascii
  where
    ascii = [c | c <- s, isAscii c]
    roundtrip v = decodeBS (decode_c (encode_c isUtf8Byte (encodeBS v)))
|