2011-12-13 19:22:43 +00:00
|
|
|
{- Some git commands output encoded filenames, in a rather annoyingly complex
|
|
|
|
- C-style encoding.
|
|
|
|
-
|
2015-01-21 16:50:09 +00:00
|
|
|
- Copyright 2010, 2011 Joey Hess <id@joeyh.name>
|
2011-12-13 19:22:43 +00:00
|
|
|
-
|
2019-03-13 19:48:14 +00:00
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
2011-12-13 19:22:43 +00:00
|
|
|
-}
|
|
|
|
|
|
|
|
module Git.Filename where
|
|
|
|
|
fix failing quickcheck properties
QuickCheck 2.10 found a counterexample eg "\929184" broke the property.
As far as I can tell, Git.Filename is matching how git handles encoding
of strange high unicode characters in filenames for display. Git does
not display high unicode characters, and instead displays the C-style
escaped form of each byte. This is ambiguous, but since git is not
unicode aware, it doesn't need to roundtrip parse it.
So, making Git.FileName's roundtrip test only chars < 256 seems fine.
Utility.Format.format uses encode_c, in order to mimic git, so that's
ok.
Utility.Format.gen uses decode_c, but only so that stuff like "\n"
in the format string is handled. If the format string contains C-style
octal escapes, they will be converted to ascii characters, and not
combined into unicode characters, but that should not be a problem.
If the user wants unicode characters, they can include them in the
format string, without escaping them.
Finally, decode_c is used by Utility.Gpg.secretKeys, because gpg
--with-colons hex-escapes some characters in particular ':' and '\\'.
gpg passes unicode through, so this use of decode_c is not a problem.
This commit was sponsored by Henrik Riomar on Patreon.
2017-06-17 20:17:09 +00:00
|
|
|
import Common
|
2011-12-23 00:14:35 +00:00
|
|
|
import Utility.Format (decode_c, encode_c)
|
2011-12-13 19:22:43 +00:00
|
|
|
|
fix failing quickcheck properties
QuickCheck 2.10 found a counterexample eg "\929184" broke the property.
As far as I can tell, Git.Filename is matching how git handles encoding
of strange high unicode characters in filenames for display. Git does
not display high unicode characters, and instead displays the C-style
escaped form of each byte. This is ambiguous, but since git is not
unicode aware, it doesn't need to roundtrip parse it.
So, making Git.FileName's roundtrip test only chars < 256 seems fine.
Utility.Format.format uses encode_c, in order to mimic git, so that's
ok.
Utility.Format.gen uses decode_c, but only so that stuff like "\n"
in the format string is handled. If the format string contains C-style
octal escapes, they will be converted to ascii characters, and not
combined into unicode characters, but that should not be a problem.
If the user wants unicode characters, they can include them in the
format string, without escaping them.
Finally, decode_c is used by Utility.Gpg.secretKeys, because gpg
--with-colons hex-escapes some characters in particular ':' and '\\'.
gpg passes unicode through, so this use of decode_c is not a problem.
This commit was sponsored by Henrik Riomar on Patreon.
2017-06-17 20:17:09 +00:00
|
|
|
import Data.Char
|
2019-11-25 20:18:19 +00:00
|
|
|
import Data.Word
|
|
|
|
import qualified Data.ByteString as S
|
2011-12-20 18:37:53 +00:00
|
|
|
|
2019-11-25 20:18:19 +00:00
|
|
|
-- Filenames that git has encoded are wrapped in double quotes. When the
-- input starts and ends with a double quote (two separate bytes), the
-- quotes are dropped and the text between them is run through decode_c.
-- Any other input, including the empty string and a lone quote, is
-- returned unchanged.
decode :: S.ByteString -> RawFilePath
decode b
    | Just (firstByte, rest) <- S.uncons b
    , firstByte == dquote
    , Just (inner, lastByte) <- S.unsnoc rest
    , lastByte == dquote
    = encodeBS (decode_c (decodeBS inner))
    | otherwise = b
  where
    -- The byte value of the double quote character.
    dquote :: Word8
    dquote = fromIntegral (ord '"')
|
2011-12-13 19:22:43 +00:00
|
|
|
|
|
|
|
{- Produces the encode_c form of a filename, wrapped in double quotes.
 - Should not need to use this, except for testing decode. -}
encode :: RawFilePath -> S.ByteString
encode s = encodeBS (concat ["\"", encode_c (decodeBS s), "\""])
|
2011-12-13 19:22:43 +00:00
|
|
|
|
2019-11-26 19:27:22 +00:00
|
|
|
prop_encode_decode_roundtrip :: FilePath -> Bool
prop_encode_decode_roundtrip s = input == roundtrip input
  where
    -- Encode a filename and decode it again, by way of RawFilePath.
    roundtrip = fromRawFilePath . decode . encode . toRawFilePath
    -- Only the characters that can be expected to roundtrip are tested.
    input = filter testable s
    testable c = lowEnough c && c /= '\NUL'
    -- High unicode does not roundtrip, because eg, both "\12345" and
    -- "\227\128\185" are encoded to "\343\200\271".
    --
    -- This property papers over the problem, by only testing
    -- chars < 256.
    lowEnough c = ord c < 256
    -- NUL is excluded as well: a String can contain a NUL, but
    -- toRawFilePath truncates on the NUL. That is generally fine,
    -- because unix filenames cannot contain NUL, but it does mean the
    -- encoding only roundtrips when there is no NUL.
|