optimize truncateFilePath

Often the filepath will be all ascii, or mostly so, and this
optimisation makes a file that has an ascii suffix of sufficient length
be roundtrip converted between String and ByteString only once, rather
than once per character.

Sponsored-by: Graham Spencer
This commit is contained in:
Joey Hess 2025-01-22 12:34:54 -04:00
parent 6211e2af4a
commit f17ec601c4
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 47 additions and 26 deletions

View file

@ -29,12 +29,12 @@ import Data.Word
genKeyName :: String -> S.ShortByteString genKeyName :: String -> S.ShortByteString
genKeyName s genKeyName s
-- Avoid making keys longer than the length of a SHA256 checksum. -- Avoid making keys longer than the length of a SHA256 checksum.
| bytelen > sha256len = S.toShort $ encodeBS $ | bytelen > sha256len = S.toShort $
truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++ truncateFilePath (sha256len - md5len - 1) s'
show (md5 bl) <> "-" <> encodeBS (show (md5 bl))
| otherwise = S.toShort $ encodeBS s' | otherwise = S.toShort s'
where where
s' = preSanitizeKeyName s s' = encodeBS $ preSanitizeKeyName s
bl = encodeBL s bl = encodeBL s
bytelen = fromIntegral $ L.length bl bytelen = fromIntegral $ L.length bl

View file

@ -200,12 +200,12 @@ checkUrl addunlockedmatcher r o si u = do
startRemote :: AddUnlockedMatcher -> Remote -> AddUrlOptions -> SeekInput -> FilePath -> URLString -> Maybe Integer -> CommandStart startRemote :: AddUnlockedMatcher -> Remote -> AddUrlOptions -> SeekInput -> FilePath -> URLString -> Maybe Integer -> CommandStart
startRemote addunlockedmatcher r o si file uri sz = do startRemote addunlockedmatcher r o si file uri sz = do
pathmax <- liftIO $ fileNameLengthLimit "." pathmax <- liftIO $ fileNameLengthLimit "."
let file' = joinPath $ map (truncateFilePath pathmax) $ let file' = P.joinPath $ map (truncateFilePath pathmax) $
splitDirectories file P.splitDirectories (toRawFilePath file)
startingAddUrl si uri o $ do startingAddUrl si uri o $ do
showNote $ UnquotedString $ "from " ++ Remote.name r showNote $ UnquotedString $ "from " ++ Remote.name r
showDestinationFile (toRawFilePath file') showDestinationFile file'
performRemote addunlockedmatcher r o uri (toRawFilePath file') sz performRemote addunlockedmatcher r o uri file' sz
performRemote :: AddUnlockedMatcher -> Remote -> AddUrlOptions -> URLString -> RawFilePath -> Maybe Integer -> CommandPerform performRemote :: AddUnlockedMatcher -> Remote -> AddUrlOptions -> URLString -> RawFilePath -> Maybe Integer -> CommandPerform
performRemote addunlockedmatcher r o uri file sz = lookupKey file >>= \case performRemote addunlockedmatcher r o uri file sz = lookupKey file >>= \case
@ -279,7 +279,8 @@ sanitizeOrPreserveFilePath o f
return f return f
| otherwise = do | otherwise = do
pathmax <- liftIO $ fileNameLengthLimit "." pathmax <- liftIO $ fileNameLengthLimit "."
return $ truncateFilePath pathmax $ sanitizeFilePath f return $ fromRawFilePath $ truncateFilePath pathmax $
toRawFilePath $ sanitizeFilePath f
-- sanitizeFilePath avoids all these security problems -- sanitizeFilePath avoids all these security problems
-- (and probably others, but at least this catches the most egrarious ones). -- (and probably others, but at least this catches the most egrarious ones).
@ -567,8 +568,8 @@ nodownloadWeb' o addunlockedmatcher url key file = checkCanAdd o file $ \canadd
url2file :: URI -> Maybe Int -> Int -> FilePath url2file :: URI -> Maybe Int -> Int -> FilePath
url2file url pathdepth pathmax = case pathdepth of url2file url pathdepth pathmax = case pathdepth of
Nothing -> truncateFilePath pathmax $ sanitizeFilePath fullurl Nothing -> truncatesanitize fullurl
Just depth Just depth
| depth >= length urlbits -> frombits id | depth >= length urlbits -> frombits id
| depth > 0 -> frombits $ drop depth | depth > 0 -> frombits $ drop depth
| depth < 0 -> frombits $ reverse . take (negate depth) . reverse | depth < 0 -> frombits $ reverse . take (negate depth) . reverse
@ -580,8 +581,12 @@ url2file url pathdepth pathmax = case pathdepth of
, uriQuery url , uriQuery url
] ]
frombits a = intercalate "/" $ a urlbits frombits a = intercalate "/" $ a urlbits
urlbits = map (truncateFilePath pathmax . sanitizeFilePath) $ urlbits = map truncatesanitize $
filter (not . null) $ splitc '/' fullurl filter (not . null) $ splitc '/' fullurl
truncatesanitize = fromRawFilePath
. truncateFilePath pathmax
. toRawFilePath
. sanitizeFilePath
urlString2file :: URLString -> Maybe Int -> Int -> FilePath urlString2file :: URLString -> Maybe Int -> Int -> FilePath
urlString2file s pathdepth pathmax = case Url.parseURIRelaxed s of urlString2file s pathdepth pathmax = case Url.parseURIRelaxed s of

View file

@ -33,6 +33,8 @@ import qualified Data.ByteString.Lazy.UTF8 as L8
import qualified GHC.Foreign as GHC import qualified GHC.Foreign as GHC
import System.IO.Unsafe import System.IO.Unsafe
import Data.ByteString.Unsafe (unsafePackMallocCStringLen) import Data.ByteString.Unsafe (unsafePackMallocCStringLen)
import Data.Char
import Data.List
#endif #endif
{- Makes all subsequent Handles that are opened, as well as stdio Handles, {- Makes all subsequent Handles that are opened, as well as stdio Handles,
@ -125,26 +127,40 @@ toRawFilePath = encodeFilePath
- Avoids returning an invalid part of a unicode byte sequence, at the - Avoids returning an invalid part of a unicode byte sequence, at the
- cost of efficiency when running on a large FilePath. - cost of efficiency when running on a large FilePath.
-} -}
truncateFilePath :: Int -> FilePath -> FilePath truncateFilePath :: Int -> RawFilePath -> RawFilePath
#ifndef mingw32_HOST_OS #ifndef mingw32_HOST_OS
truncateFilePath n = go . reverse {- On unix, do not assume a unicode locale, but does assume ascii
- characters are a single byte. -}
truncateFilePath n b =
let blen = S.length b
in if blen <= n
then b
else go blen (reverse (fromRawFilePath b))
where where
go f = go blen f = case uncons f of
let b = encodeBS f Just (c, f')
in if S.length b <= n | isAscii c ->
then reverse f let blen' = blen - 1
else go (drop 1 f) in if blen' <= n
then toRawFilePath (reverse f')
else go blen' f'
| otherwise ->
let blen' = S.length (toRawFilePath f')
in if blen' <= n
then toRawFilePath (reverse f')
else go blen' f'
Nothing -> toRawFilePath (reverse f)
#else #else
{- On Windows, count the number of bytes used by each utf8 character. -} {- On Windows, count the number of bytes used by each utf8 character. -}
truncateFilePath n = reverse . go [] n . L8.fromString truncateFilePath n = toRawFilePath . reverse . go [] n
where where
go coll cnt bs go coll cnt bs
| cnt <= 0 = coll | cnt <= 0 = coll
| otherwise = case L8.decode bs of | otherwise = case S8.decode bs of
Just (c, x) | c /= L8.replacement_char -> Just (c, x) | c /= S8.replacement_char ->
let x' = fromIntegral x let x' = fromIntegral x
in if cnt - x' < 0 in if cnt - x' < 0
then coll then coll
else go (c:coll) (cnt - x') (L8.drop 1 bs) else go (c:coll) (cnt - x') (S8.drop 1 bs)
_ -> coll _ -> coll
#endif #endif

View file

@ -112,8 +112,8 @@ relatedTemplate f
{- Some filesystems like FAT have issues with filenames {- Some filesystems like FAT have issues with filenames
- ending in ".", so avoid truncating a filename to end - ending in ".", so avoid truncating a filename to end
- that way. -} - that way. -}
toOsPath $ B.dropWhileEnd (== dot) $ toRawFilePath $ toOsPath $ B.dropWhileEnd (== dot) $
truncateFilePath (len - templateAddedLength) (fromRawFilePath f) truncateFilePath (len - templateAddedLength) f
| otherwise = toOsPath f | otherwise = toOsPath f
where where
len = B.length f len = B.length f