simplify and speed up Utility.FileSystemEncoding
This eliminates the distinction between decodeBS and decodeBS', encodeBS and encodeBS', etc. The old implementation truncated at NUL, and the primed versions had to do extra work to avoid that problem. The new implementation does not truncate at NUL, and is also a lot faster. (Benchmarked at 2x faster for decodeBS and 3x for encodeBS; more for the primed versions.) Note that filepath-bytestring 1.4.2.1.8 contains the same optimisation, and upgrading to it will speed up to/fromRawFilePath. AFAIK, nothing relied on the old behavior of truncating at NUL. Some code used the faster versions in places where I was sure there would not be a NUL. So this change is unlikely to break anything. Also, moved s2w8 and w82s out of the module, as they do not involve filesystem encoding really. Sponsored-by: Shae Erisson on Patreon
This commit is contained in:
parent
a38b724bfa
commit
fa62c98910
55 changed files with 138 additions and 217 deletions
|
@ -1,6 +1,6 @@
|
|||
{- GHC File system encoding handling.
|
||||
-
|
||||
- Copyright 2012-2016 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2012-2021 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- License: BSD-2-clause
|
||||
-}
|
||||
|
@ -18,36 +18,22 @@ module Utility.FileSystemEncoding (
|
|||
encodeBL,
|
||||
decodeBS,
|
||||
encodeBS,
|
||||
decodeBL',
|
||||
encodeBL',
|
||||
decodeBS',
|
||||
encodeBS',
|
||||
truncateFilePath,
|
||||
s2w8,
|
||||
w82s,
|
||||
c2w8,
|
||||
w82c,
|
||||
) where
|
||||
|
||||
import qualified GHC.Foreign as GHC
|
||||
import qualified GHC.IO.Encoding as Encoding
|
||||
import Foreign.C
|
||||
import System.IO
|
||||
import System.IO.Unsafe
|
||||
import Data.Word
|
||||
import System.FilePath.ByteString (RawFilePath, encodeFilePath, decodeFilePath)
|
||||
import qualified Data.ByteString as S
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
import Data.ByteString.Unsafe (unsafePackMallocCStringLen)
|
||||
#ifdef mingw32_HOST_OS
|
||||
import qualified Data.ByteString.UTF8 as S8
|
||||
import qualified Data.ByteString.Lazy.UTF8 as L8
|
||||
#else
|
||||
import Data.List
|
||||
import Utility.Split
|
||||
#endif
|
||||
|
||||
import Utility.Exception
|
||||
|
||||
{- Makes all subsequent Handles that are opened, as well as stdio Handles,
|
||||
- use the filesystem encoding, instead of the encoding of the current
|
||||
- locale.
|
||||
|
@ -80,32 +66,10 @@ fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding
|
|||
fileEncoding h = hSetEncoding h Encoding.utf8
|
||||
#endif
|
||||
|
||||
{- Encodes a FilePath into a String, applying the filesystem encoding.
|
||||
-
|
||||
- There are very few things it makes sense to do with such an encoded
|
||||
- string. It's not a legal filename; it should not be displayed.
|
||||
- So this function is not exported, but instead used by the few functions
|
||||
- that can usefully consume it.
|
||||
-
|
||||
- This use of unsafePerformIO is belived to be safe; GHC's interface
|
||||
- only allows doing this conversion with CStrings, and the CString buffer
|
||||
- is allocated, used, and deallocated within the call, with no side
|
||||
- effects.
|
||||
-
|
||||
- If the FilePath contains a value that is not legal in the filesystem
|
||||
- encoding, rather than thowing an exception, it will be returned as-is.
|
||||
-}
|
||||
{-# NOINLINE _encodeFilePath #-}
|
||||
_encodeFilePath :: FilePath -> String
|
||||
_encodeFilePath fp = unsafePerformIO $ do
|
||||
enc <- Encoding.getFileSystemEncoding
|
||||
GHC.withCString enc fp (GHC.peekCString Encoding.char8)
|
||||
`catchNonAsync` (\_ -> return fp)
|
||||
|
||||
{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
|
||||
decodeBL :: L.ByteString -> FilePath
|
||||
#ifndef mingw32_HOST_OS
|
||||
decodeBL = encodeW8NUL . L.unpack
|
||||
decodeBL = decodeBS . L.toStrict
|
||||
#else
|
||||
{- On Windows, we assume that the ByteString is utf-8, since Windows
|
||||
- only uses unicode for filenames. -}
|
||||
|
@ -115,104 +79,45 @@ decodeBL = L8.toString
|
|||
{- Encodes a FilePath into a ByteString, applying the filesystem encoding. -}
|
||||
encodeBL :: FilePath -> L.ByteString
|
||||
#ifndef mingw32_HOST_OS
|
||||
encodeBL = L.pack . decodeW8NUL
|
||||
encodeBL = L.fromStrict . encodeBS
|
||||
#else
|
||||
encodeBL = L8.fromString
|
||||
#endif
|
||||
|
||||
decodeBS :: S.ByteString -> FilePath
|
||||
#ifndef mingw32_HOST_OS
|
||||
decodeBS = encodeW8NUL . S.unpack
|
||||
-- This is a copy of code from System.FilePath.Internal.decodeFilePath.
|
||||
-- However, older versions of that library truncated at NUL, which this
|
||||
-- must not do, because it may end up used on something other than a unix
|
||||
-- filepath.
|
||||
{-# NOINLINE decodeBS #-}
|
||||
decodeBS b = unsafePerformIO $ do
|
||||
enc <- Encoding.getFileSystemEncoding
|
||||
S.useAsCStringLen b (GHC.peekCStringLen enc)
|
||||
#else
|
||||
decodeBS = S8.toString
|
||||
#endif
|
||||
|
||||
encodeBS :: FilePath -> S.ByteString
|
||||
#ifndef mingw32_HOST_OS
|
||||
encodeBS = S.pack . decodeW8NUL
|
||||
-- This is a copy of code from System.FilePath.Internal.encodeFilePath.
|
||||
-- However, older versions of that library truncated at NUL, which this
|
||||
-- must not do, because it may end up used on something other than a unix
|
||||
-- filepath.
|
||||
{-# NOINLINE encodeBS #-}
|
||||
encodeBS f = unsafePerformIO $ do
|
||||
enc <- Encoding.getFileSystemEncoding
|
||||
GHC.newCStringLen enc f >>= unsafePackMallocCStringLen
|
||||
#else
|
||||
encodeBS = S8.fromString
|
||||
#endif
|
||||
|
||||
{- Faster version that assumes the string does not contain NUL;
|
||||
- if it does it will be truncated before the NUL. -}
|
||||
decodeBS' :: S.ByteString -> FilePath
|
||||
#ifndef mingw32_HOST_OS
|
||||
decodeBS' = encodeW8 . S.unpack
|
||||
#else
|
||||
decodeBS' = S8.toString
|
||||
#endif
|
||||
|
||||
encodeBS' :: FilePath -> S.ByteString
|
||||
#ifndef mingw32_HOST_OS
|
||||
encodeBS' = S.pack . decodeW8
|
||||
#else
|
||||
encodeBS' = S8.fromString
|
||||
#endif
|
||||
|
||||
decodeBL' :: L.ByteString -> FilePath
|
||||
#ifndef mingw32_HOST_OS
|
||||
decodeBL' = encodeW8 . L.unpack
|
||||
#else
|
||||
decodeBL' = L8.toString
|
||||
#endif
|
||||
|
||||
encodeBL' :: FilePath -> L.ByteString
|
||||
#ifndef mingw32_HOST_OS
|
||||
encodeBL' = L.pack . decodeW8
|
||||
#else
|
||||
encodeBL' = L8.fromString
|
||||
#endif
|
||||
|
||||
fromRawFilePath :: RawFilePath -> FilePath
|
||||
fromRawFilePath = decodeFilePath
|
||||
|
||||
toRawFilePath :: FilePath -> RawFilePath
|
||||
toRawFilePath = encodeFilePath
|
||||
|
||||
#ifndef mingw32_HOST_OS
|
||||
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
|
||||
-
|
||||
- w82s produces a String, which may contain Chars that are invalid
|
||||
- unicode. From there, this is really a simple matter of applying the
|
||||
- file system encoding, only complicated by GHC's interface to doing so.
|
||||
-
|
||||
- Note that the encoding stops at any NUL in the input. FilePaths
|
||||
- cannot contain embedded NUL, but Haskell Strings may.
|
||||
-}
|
||||
{-# NOINLINE encodeW8 #-}
|
||||
encodeW8 :: [Word8] -> FilePath
|
||||
encodeW8 w8 = unsafePerformIO $ do
|
||||
enc <- Encoding.getFileSystemEncoding
|
||||
GHC.withCString Encoding.char8 (w82s w8) $ GHC.peekCString enc
|
||||
|
||||
decodeW8 :: FilePath -> [Word8]
|
||||
decodeW8 = s2w8 . _encodeFilePath
|
||||
|
||||
{- Like encodeW8 and decodeW8, but NULs are passed through unchanged. -}
|
||||
encodeW8NUL :: [Word8] -> FilePath
|
||||
encodeW8NUL = intercalate [nul] . map encodeW8 . splitc (c2w8 nul)
|
||||
where
|
||||
nul = '\NUL'
|
||||
|
||||
decodeW8NUL :: FilePath -> [Word8]
|
||||
decodeW8NUL = intercalate [c2w8 nul] . map decodeW8 . splitc nul
|
||||
where
|
||||
nul = '\NUL'
|
||||
#endif
|
||||
|
||||
c2w8 :: Char -> Word8
|
||||
c2w8 = fromIntegral . fromEnum
|
||||
|
||||
w82c :: Word8 -> Char
|
||||
w82c = toEnum . fromIntegral
|
||||
|
||||
s2w8 :: String -> [Word8]
|
||||
s2w8 = map c2w8
|
||||
|
||||
w82s :: [Word8] -> String
|
||||
w82s = map w82c
|
||||
|
||||
{- Truncates a FilePath to the given number of bytes (or less),
|
||||
- as represented on disk.
|
||||
-
|
||||
|
@ -224,8 +129,8 @@ truncateFilePath :: Int -> FilePath -> FilePath
|
|||
truncateFilePath n = go . reverse
|
||||
where
|
||||
go f =
|
||||
let bytes = decodeW8 f
|
||||
in if length bytes <= n
|
||||
let b = encodeBS f
|
||||
in if S.length b <= n
|
||||
then reverse f
|
||||
else go (drop 1 f)
|
||||
#else
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue