2023-04-08 19:48:32 +00:00
|
|
|
{- Some git commands output quoted filenames, in a rather annoyingly complex
|
2011-12-13 19:22:43 +00:00
|
|
|
- C-style encoding.
|
|
|
|
-
|
2023-04-07 20:47:26 +00:00
|
|
|
- Copyright 2010-2023 Joey Hess <id@joeyh.name>
|
2011-12-13 19:22:43 +00:00
|
|
|
-
|
2019-03-13 19:48:14 +00:00
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
2011-12-13 19:22:43 +00:00
|
|
|
-}
|
|
|
|
|
2023-04-08 19:48:32 +00:00
|
|
|
{-# LANGUAGE OverloadedStrings, TypeSynonymInstances #-}
|
2023-04-07 20:47:26 +00:00
|
|
|
|
2023-04-08 19:48:32 +00:00
|
|
|
module Git.Filename (
|
|
|
|
unquote,
|
|
|
|
quote,
|
|
|
|
QuotePath(..),
|
|
|
|
StringContainingQuotedPath(..),
|
2023-04-10 16:56:45 +00:00
|
|
|
quotedPaths,
|
2023-04-08 19:48:32 +00:00
|
|
|
prop_quote_unquote_roundtrip,
|
|
|
|
) where
|
2011-12-13 19:22:43 +00:00
|
|
|
|
fix failing quickcheck properties
QuickCheck 2.10 found a counterexample eg "\929184" broke the property.
As far as I can tell, Git.Filename is matching how git handles encoding
of strange high unicode characters in filenames for display. Git does
not display high unicode characters, and instead displays the C-style
escaped form of each byte. This is ambiguous, but since git is not
unicode aware, it doesn't need to roundtrip parse it.
So, making Git.Filename's roundtrip test cover only chars < 256 seems fine.
Utility.Format.format uses encode_c, in order to mimic git, so that's
ok.
Utility.Format.gen uses decode_c, but only so that stuff like "\n"
in the format string is handled. If the format string contains C-style
octal escapes, they will be converted to ascii characters, and not
combined into unicode characters, but that should not be a problem.
If the user wants unicode characters, they can include them in the
format string, without escaping them.
Finally, decode_c is used by Utility.Gpg.secretKeys, because gpg
--with-colons hex-escapes some characters in particular ':' and '\\'.
gpg passes unicode through, so this use of decode_c is not a problem.
This commit was sponsored by Henrik Riomar on Patreon.
2017-06-17 20:17:09 +00:00
|
|
|
import Common
|
2023-04-07 21:12:55 +00:00
|
|
|
import Utility.Format (decode_c, encode_c, encode_c', isUtf8Byte)
|
2020-11-10 00:07:31 +00:00
|
|
|
import Utility.QuickCheck
|
2011-12-13 19:22:43 +00:00
|
|
|
|
fix failing quickcheck properties
QuickCheck 2.10 found a counterexample eg "\929184" broke the property.
As far as I can tell, Git.Filename is matching how git handles encoding
of strange high unicode characters in filenames for display. Git does
not display high unicode characters, and instead displays the C-style
escaped form of each byte. This is ambiguous, but since git is not
unicode aware, it doesn't need to roundtrip parse it.
So, making Git.Filename's roundtrip test cover only chars < 256 seems fine.
Utility.Format.format uses encode_c, in order to mimic git, so that's
ok.
Utility.Format.gen uses decode_c, but only so that stuff like "\n"
in the format string is handled. If the format string contains C-style
octal escapes, they will be converted to ascii characters, and not
combined into unicode characters, but that should not be a problem.
If the user wants unicode characters, they can include them in the
format string, without escaping them.
Finally, decode_c is used by Utility.Gpg.secretKeys, because gpg
--with-colons hex-escapes some characters in particular ':' and '\\'.
gpg passes unicode through, so this use of decode_c is not a problem.
This commit was sponsored by Henrik Riomar on Patreon.
2017-06-17 20:17:09 +00:00
|
|
|
import Data.Char
|
2019-11-25 20:18:19 +00:00
|
|
|
import Data.Word
|
2023-04-08 19:48:32 +00:00
|
|
|
import Data.String
|
2019-11-25 20:18:19 +00:00
|
|
|
import qualified Data.ByteString as S
|
2023-04-08 19:48:32 +00:00
|
|
|
import qualified Data.Semigroup as Sem
|
|
|
|
import Prelude
|
2011-12-20 18:37:53 +00:00
|
|
|
|
2023-04-08 19:48:32 +00:00
|
|
|
-- | Undoes the C-style quoting that some git commands apply to filenames.
--
-- Input that is not wrapped in a pair of double quotes is returned as-is.
-- Otherwise the bytes between the two quotes are run through 'decode_c'.
unquote :: S.ByteString -> RawFilePath
unquote quoted = maybe quoted decode_c (stripLeading quoted >>= stripTrailing)
  where
    dquote :: Word8
    dquote = fromIntegral (ord '"')

    -- Drops the first byte, but only when it is a double quote.
    stripLeading bs = case S.uncons bs of
        Just (first, rest) | first == dquote -> Just rest
        _ -> Nothing

    -- Drops the last byte, but only when it is a double quote.
    stripTrailing bs = case S.unsnoc bs of
        Just (inner, final) | final == dquote -> Just inner
        _ -> Nothing
|
2011-12-13 19:22:43 +00:00
|
|
|
|
2023-04-07 20:47:26 +00:00
|
|
|
-- | Unconditionally C-style encodes the path and wraps it in double
-- quotes, even in cases where git would not quote it.
quoteAlways :: RawFilePath -> S.ByteString
quoteAlways path = S.concat [dquote, encode_c mustEscape path, dquote]
  where
    dquote :: S.ByteString
    dquote = "\""

    -- Extra bytes to escape: the double quote itself, plus any byte that
    -- isUtf8Byte flags.
    mustEscape byte = byte == fromIntegral (ord '"') || isUtf8Byte byte
|
2011-12-13 19:22:43 +00:00
|
|
|
|
2023-04-07 21:12:55 +00:00
|
|
|
-- git config core.quotePath controls whether to quote unicode characters
|
|
|
|
-- When the wrapped Bool is True, 'quote' on a RawFilePath also escapes
-- bytes for which isUtf8Byte holds; when False, those bytes are not
-- selected for escaping by 'quote'.
newtype QuotePath = QuotePath Bool
|
|
|
|
|
2023-04-08 19:48:32 +00:00
|
|
|
-- | Values that can be rendered to a ByteString, applying git-style
-- quoting under the control of a 'QuotePath' setting.
class Quoteable t where
    -- | Double quotes and encodes the value in the cases where git would.
    quote :: QuotePath -> t -> S.ByteString
|
|
|
|
|
|
|
|
instance Quoteable RawFilePath where
    -- When encode_c' returns Nothing the path is passed through unchanged;
    -- otherwise the encoded form is wrapped in double quotes.
    quote (QuotePath quoteUnicode) path =
        maybe path wrap (encode_c' mustEscape path)
      where
        wrap encoded = S.concat ["\"", encoded, "\""]

        -- A double quote is always selected for escaping; bytes flagged
        -- by isUtf8Byte are selected only when core.quotePath is enabled.
        mustEscape byte =
            byte == fromIntegral (ord '"')
                || (quoteUnicode && isUtf8Byte byte)
|
|
|
|
|
|
|
|
-- | A message under construction that can embed paths; the embedded
-- paths are quoted when the whole value is rendered with 'quote'.
--
-- With OverloadedStrings, string literals become 'UnquotedString' and
-- are passed through without quoting.
--
-- Eg: QuotedPath f <> ": not found"
data StringContainingQuotedPath
    = UnquotedString String
    -- ^ literal text, never quoted
    | QuotedPath RawFilePath
    -- ^ a path, quoted when rendered
    | StringContainingQuotedPath :+: StringContainingQuotedPath
    -- ^ concatenation of two pieces
    deriving (Show, Eq)
|
|
|
|
|
2023-04-10 16:56:45 +00:00
|
|
|
-- | Turns a list of paths into a single value in which every path is a
-- 'QuotedPath' and consecutive paths are separated by one space.
quotedPaths :: [RawFilePath] -> StringContainingQuotedPath
quotedPaths paths = case paths of
    [] -> mempty
    -- The trailing mempty is kept so the resulting value has the same
    -- structure (and so the same Show/Eq behaviour) as before.
    [only] -> QuotedPath only <> mempty
    (first:others) -> QuotedPath first <> (" " <> quotedPaths others)
|
|
|
|
|
2023-04-08 19:48:32 +00:00
|
|
|
instance Quoteable StringContainingQuotedPath where
    -- Plain text is converted with encodeBS and never quoted, paths are
    -- quoted per the QuotePath setting, and a combined value is rendered
    -- as its left piece followed by its right piece.
    quote qp piece = case piece of
        UnquotedString str -> encodeBS str
        QuotedPath path -> quote qp path
        l :+: r -> quote qp l <> quote qp r
|
2023-04-08 19:48:32 +00:00
|
|
|
|
|
|
|
instance IsString StringContainingQuotedPath where
    -- A string literal becomes text that is passed through unquoted.
    fromString str = UnquotedString str
|
|
|
|
|
|
|
|
instance Sem.Semigroup StringContainingQuotedPath where
    -- Two unquoted strings are merged into a single one; every other
    -- combination is recorded with ':+:'.
    --
    -- NOTE(review): the result is not normalized, so values combined in
    -- different groupings can differ under the derived Eq.
    l <> r = case (l, r) of
        (UnquotedString a, UnquotedString b) -> UnquotedString (a <> b)
        _ -> l :+: r
|
2023-04-08 19:48:32 +00:00
|
|
|
|
|
|
|
instance Monoid StringContainingQuotedPath where
    -- The empty value is an unquoted, empty string.
    mempty = UnquotedString []
|
2023-04-07 21:12:55 +00:00
|
|
|
|
2020-11-10 00:07:31 +00:00
|
|
|
-- | Property: quoting a path with quoteAlways and then unquoting it
-- gives back the original path.
--
-- This only holds when the string does not contain high unicode,
-- because eg, both "\12345" and "\227\128\185" are encoded to
-- "\343\200\271".
--
-- That is not a real-world problem, and using TestableFilePath
-- limits what's tested to ascii, so avoids running into it.
prop_quote_unquote_roundtrip :: TestableFilePath -> Bool
prop_quote_unquote_roundtrip ts = roundtrip original == original
  where
    original = fromTestableFilePath ts
    roundtrip = fromRawFilePath . unquote . quoteAlways . toRawFilePath
|