optimize truncateFilePath

Often the filepath will be all ascii, or mostly so, and this
optimisation makes a file that has an ascii suffix of sufficient length
be roundtrip converted between String and ByteString only once, rather
than once per character.

Sponsored-by: Graham Spencer
This commit is contained in:
Joey Hess 2025-01-22 12:34:54 -04:00
parent 6211e2af4a
commit f17ec601c4
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 47 additions and 26 deletions

View file

@ -29,12 +29,12 @@ import Data.Word
genKeyName :: String -> S.ShortByteString
genKeyName s
-- Avoid making keys longer than the length of a SHA256 checksum.
| bytelen > sha256len = S.toShort $ encodeBS $
truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++
show (md5 bl)
| otherwise = S.toShort $ encodeBS s'
| bytelen > sha256len = S.toShort $
truncateFilePath (sha256len - md5len - 1) s'
<> "-" <> encodeBS (show (md5 bl))
| otherwise = S.toShort s'
where
s' = preSanitizeKeyName s
s' = encodeBS $ preSanitizeKeyName s
bl = encodeBL s
bytelen = fromIntegral $ L.length bl

View file

@ -200,12 +200,12 @@ checkUrl addunlockedmatcher r o si u = do
startRemote :: AddUnlockedMatcher -> Remote -> AddUrlOptions -> SeekInput -> FilePath -> URLString -> Maybe Integer -> CommandStart
startRemote addunlockedmatcher r o si file uri sz = do
pathmax <- liftIO $ fileNameLengthLimit "."
let file' = joinPath $ map (truncateFilePath pathmax) $
splitDirectories file
let file' = P.joinPath $ map (truncateFilePath pathmax) $
P.splitDirectories (toRawFilePath file)
startingAddUrl si uri o $ do
showNote $ UnquotedString $ "from " ++ Remote.name r
showDestinationFile (toRawFilePath file')
performRemote addunlockedmatcher r o uri (toRawFilePath file') sz
showDestinationFile file'
performRemote addunlockedmatcher r o uri file' sz
performRemote :: AddUnlockedMatcher -> Remote -> AddUrlOptions -> URLString -> RawFilePath -> Maybe Integer -> CommandPerform
performRemote addunlockedmatcher r o uri file sz = lookupKey file >>= \case
@ -279,7 +279,8 @@ sanitizeOrPreserveFilePath o f
return f
| otherwise = do
pathmax <- liftIO $ fileNameLengthLimit "."
return $ truncateFilePath pathmax $ sanitizeFilePath f
return $ fromRawFilePath $ truncateFilePath pathmax $
toRawFilePath $ sanitizeFilePath f
-- sanitizeFilePath avoids all these security problems
-- (and probably others, but at least this catches the most egrarious ones).
@ -567,8 +568,8 @@ nodownloadWeb' o addunlockedmatcher url key file = checkCanAdd o file $ \canadd
url2file :: URI -> Maybe Int -> Int -> FilePath
url2file url pathdepth pathmax = case pathdepth of
Nothing -> truncateFilePath pathmax $ sanitizeFilePath fullurl
Just depth
Nothing -> truncatesanitize fullurl
Just depth
| depth >= length urlbits -> frombits id
| depth > 0 -> frombits $ drop depth
| depth < 0 -> frombits $ reverse . take (negate depth) . reverse
@ -580,8 +581,12 @@ url2file url pathdepth pathmax = case pathdepth of
, uriQuery url
]
frombits a = intercalate "/" $ a urlbits
urlbits = map (truncateFilePath pathmax . sanitizeFilePath) $
urlbits = map truncatesanitize $
filter (not . null) $ splitc '/' fullurl
truncatesanitize = fromRawFilePath
. truncateFilePath pathmax
. toRawFilePath
. sanitizeFilePath
urlString2file :: URLString -> Maybe Int -> Int -> FilePath
urlString2file s pathdepth pathmax = case Url.parseURIRelaxed s of

View file

@ -33,6 +33,8 @@ import qualified Data.ByteString.Lazy.UTF8 as L8
import qualified GHC.Foreign as GHC
import System.IO.Unsafe
import Data.ByteString.Unsafe (unsafePackMallocCStringLen)
import Data.Char
import Data.List
#endif
{- Makes all subsequent Handles that are opened, as well as stdio Handles,
@ -125,26 +127,40 @@ toRawFilePath = encodeFilePath
- Avoids returning an invalid part of a unicode byte sequence, at the
- cost of efficiency when running on a large FilePath.
-}
truncateFilePath :: Int -> FilePath -> FilePath
truncateFilePath :: Int -> RawFilePath -> RawFilePath
#ifndef mingw32_HOST_OS
truncateFilePath n = go . reverse
{- On unix, do not assume a unicode locale, but does assume ascii
- characters are a single byte. -}
truncateFilePath n b =
let blen = S.length b
in if blen <= n
then b
else go blen (reverse (fromRawFilePath b))
where
go f =
let b = encodeBS f
in if S.length b <= n
then reverse f
else go (drop 1 f)
go blen f = case uncons f of
Just (c, f')
| isAscii c ->
let blen' = blen - 1
in if blen' <= n
then toRawFilePath (reverse f')
else go blen' f'
| otherwise ->
let blen' = S.length (toRawFilePath f')
in if blen' <= n
then toRawFilePath (reverse f')
else go blen' f'
Nothing -> toRawFilePath (reverse f)
#else
{- On Windows, count the number of bytes used by each utf8 character. -}
truncateFilePath n = reverse . go [] n . L8.fromString
truncateFilePath n = toRawFilePath . reverse . go [] n
where
go coll cnt bs
| cnt <= 0 = coll
| otherwise = case L8.decode bs of
Just (c, x) | c /= L8.replacement_char ->
| otherwise = case S8.decode bs of
Just (c, x) | c /= S8.replacement_char ->
let x' = fromIntegral x
in if cnt - x' < 0
then coll
else go (c:coll) (cnt - x') (L8.drop 1 bs)
else go (c:coll) (cnt - x') (S8.drop 1 bs)
_ -> coll
#endif

View file

@ -112,8 +112,8 @@ relatedTemplate f
{- Some filesystems like FAT have issues with filenames
- ending in ".", so avoid truncating a filename to end
- that way. -}
toOsPath $ B.dropWhileEnd (== dot) $ toRawFilePath $
truncateFilePath (len - templateAddedLength) (fromRawFilePath f)
toOsPath $ B.dropWhileEnd (== dot) $
truncateFilePath (len - templateAddedLength) f
| otherwise = toOsPath f
where
len = B.length f