Some optimisations to string splitting code.

Turns out that Data.List.Utils.split is slow and makes a lot of
allocations. Here's a much simpler single character splitter that behaves
the same (even in wacky corner cases) while running in half the time and
75% the allocations.

As well as being an optimisation, this helps move toward eliminating use of
missingh.

(Data.List.Split.splitOn is nearly as slow as Data.List.Utils.split and
allocates even more.)

I have not benchmarked the effect on git-annex, but would not be surprised
to see some parsing of eg, large streams from git commands run twice as
fast, and possibly in less memory.

This commit was sponsored by Boyd Stephen Smith Jr. on Patreon.
This commit is contained in:
Joey Hess 2017-01-31 18:40:42 -04:00
parent dbaea98836
commit 9eb10caa27
No known key found for this signature in database
GPG key ID: C910D9222512E3C7
30 changed files with 47 additions and 38 deletions

View file

@ -441,7 +441,7 @@ setDirect wantdirect = do
- this way things that show HEAD (eg shell prompts) will - this way things that show HEAD (eg shell prompts) will
- hopefully show just "master". -} - hopefully show just "master". -}
directBranch :: Ref -> Ref directBranch :: Ref -> Ref
directBranch orighead = case split "/" $ fromRef orighead of directBranch orighead = case splitc '/' $ fromRef orighead of
("refs":"heads":"annex":"direct":_) -> orighead ("refs":"heads":"annex":"direct":_) -> orighead
("refs":"heads":rest) -> ("refs":"heads":rest) ->
Ref $ "refs/heads/annex/direct/" ++ intercalate "/" rest Ref $ "refs/heads/annex/direct/" ++ intercalate "/" rest
@ -452,7 +452,7 @@ directBranch orighead = case split "/" $ fromRef orighead of
- Any other ref is left unchanged. - Any other ref is left unchanged.
-} -}
fromDirectBranch :: Ref -> Ref fromDirectBranch :: Ref -> Ref
fromDirectBranch directhead = case split "/" $ fromRef directhead of fromDirectBranch directhead = case splitc '/' $ fromRef directhead of
("refs":"heads":"annex":"direct":rest) -> ("refs":"heads":"annex":"direct":rest) ->
Ref $ "refs/heads/" ++ intercalate "/" rest Ref $ "refs/heads/" ++ intercalate "/" rest
_ -> directhead _ -> directhead

View file

@ -39,7 +39,7 @@ toTaggedBranch u info b = Git.Ref $ intercalate "/" $ catMaybes
] ]
fromTaggedBranch :: Git.Branch -> Maybe (UUID, Maybe String) fromTaggedBranch :: Git.Branch -> Maybe (UUID, Maybe String)
fromTaggedBranch b = case split "/" $ Git.fromRef b of fromTaggedBranch b = case splitc '/' $ Git.fromRef b of
("refs":"synced":u:info:_base) -> ("refs":"synced":u:info:_base) ->
Just (toUUID u, fromB64Maybe info) Just (toUUID u, fromB64Maybe info)
("refs":"synced":u:_base) -> ("refs":"synced":u:_base) ->

View file

@ -383,7 +383,7 @@ mangleSshHostName sshdata = intercalate "-"
{- Extracts the real hostname from a mangled ssh hostname. -} {- Extracts the real hostname from a mangled ssh hostname. -}
unMangleSshHostName :: String -> String unMangleSshHostName :: String -> String
unMangleSshHostName h = case split "-" h of unMangleSshHostName h = case splitc '-' h of
("git":"annex":rest) -> unescape (intercalate "-" (beginning rest)) ("git":"annex":rest) -> unescape (intercalate "-" (beginning rest))
_ -> h _ -> h
where where

View file

@ -103,7 +103,7 @@ selectExtension f
es = filter (not . null) $ reverse $ es = filter (not . null) $ reverse $
take 2 $ map (filter validInExtension) $ take 2 $ map (filter validInExtension) $
takeWhile shortenough $ takeWhile shortenough $
reverse $ split "." $ takeExtensions f reverse $ splitc '.' $ takeExtensions f
shortenough e = length e <= 4 -- long enough for "jpeg" shortenough e = length e <= 4 -- long enough for "jpeg"
{- A key's checksum is checked during fsck. -} {- A key's checksum is checked during fsck. -}

View file

@ -95,7 +95,7 @@ findLibPath l = go =<< getEnv "DYLD_LIBRARY_PATH"
where where
go Nothing = return l go Nothing = return l
go (Just p) = fromMaybe l go (Just p) = fromMaybe l
<$> firstM doesFileExist (map (</> f) (split ":" p)) <$> firstM doesFileExist (map (</> f) (splitc ':' p))
f = takeFileName l f = takeFileName l
{- Expands any @rpath in the list of libraries. {- Expands any @rpath in the list of libraries.

View file

@ -8,6 +8,7 @@ git-annex (6.20170102) UNRELEASED; urgency=medium
* vicfg: Include the numcopies configuation. * vicfg: Include the numcopies configuation.
* config: New command for storing configuration in the git-annex branch. * config: New command for storing configuration in the git-annex branch.
* stack.yaml: Update to lts-7.18. * stack.yaml: Update to lts-7.18.
* Some optimisations to string splitting code.
-- Joey Hess <id@joeyh.name> Fri, 06 Jan 2017 15:22:06 -0400 -- Joey Hess <id@joeyh.name> Fri, 06 Jan 2017 15:22:06 -0400

View file

@ -387,7 +387,7 @@ url2file url pathdepth pathmax = case pathdepth of
] ]
frombits a = intercalate "/" $ a urlbits frombits a = intercalate "/" $ a urlbits
urlbits = map (truncateFilePath pathmax . sanitizeFilePath) $ urlbits = map (truncateFilePath pathmax . sanitizeFilePath) $
filter (not . null) $ split "/" fullurl filter (not . null) $ splitc '/' fullurl
urlString2file :: URLString -> Maybe Int -> Int -> FilePath urlString2file :: URLString -> Maybe Int -> Int -> FilePath
urlString2file s pathdepth pathmax = case Url.parseURIRelaxed s of urlString2file s pathdepth pathmax = case Url.parseURIRelaxed s of

View file

@ -92,7 +92,7 @@ hostname r
| otherwise = "localhost" | otherwise = "localhost"
basehostname :: Git.Repo -> String basehostname :: Git.Repo -> String
basehostname r = fromMaybe "" $ headMaybe $ split "." $ hostname r basehostname r = fromMaybe "" $ headMaybe $ splitc '.' $ hostname r
{- A name to display for a repo. Uses the name from uuid.log if available, {- A name to display for a repo. Uses the name from uuid.log if available,
- or the remote name if not. -} - or the remote name if not. -}

View file

@ -231,8 +231,8 @@ instance LensGpgEncParams (RemoteConfig, RemoteGitConfig) where
{- When the remote is configured to use public-key encryption, {- When the remote is configured to use public-key encryption,
- look up the recipient keys and add them to the option list. -} - look up the recipient keys and add them to the option list. -}
case M.lookup "encryption" c of case M.lookup "encryption" c of
Just "pubkey" -> Gpg.pkEncTo $ maybe [] (split ",") $ M.lookup "cipherkeys" c Just "pubkey" -> Gpg.pkEncTo $ maybe [] (splitc ',') $ M.lookup "cipherkeys" c
Just "sharedpubkey" -> Gpg.pkEncTo $ maybe [] (split ",") $ M.lookup "pubkeys" c Just "sharedpubkey" -> Gpg.pkEncTo $ maybe [] (splitc ',') $ M.lookup "pubkeys" c
_ -> [] _ -> []
getGpgDecParams (_c,gc) = map Param (remoteAnnexGnupgDecryptOptions gc) getGpgDecParams (_c,gc) = map Param (remoteAnnexGnupgDecryptOptions gc)

View file

@ -91,16 +91,16 @@ pipeWrite params repo = withHandle StdinHandle createProcessSuccess $
pipeNullSplit :: [CommandParam] -> Repo -> IO ([String], IO Bool) pipeNullSplit :: [CommandParam] -> Repo -> IO ([String], IO Bool)
pipeNullSplit params repo = do pipeNullSplit params repo = do
(s, cleanup) <- pipeReadLazy params repo (s, cleanup) <- pipeReadLazy params repo
return (filter (not . null) $ split sep s, cleanup) return (filter (not . null) $ splitc sep s, cleanup)
where where
sep = "\0" sep = '\0'
pipeNullSplitStrict :: [CommandParam] -> Repo -> IO [String] pipeNullSplitStrict :: [CommandParam] -> Repo -> IO [String]
pipeNullSplitStrict params repo = do pipeNullSplitStrict params repo = do
s <- pipeReadStrict params repo s <- pipeReadStrict params repo
return $ filter (not . null) $ split sep s return $ filter (not . null) $ splitc sep s
where where
sep = "\0" sep = '\0'
pipeNullSplitZombie :: [CommandParam] -> Repo -> IO [String] pipeNullSplitZombie :: [CommandParam] -> Repo -> IO [String]
pipeNullSplitZombie params repo = leaveZombie <$> pipeNullSplit params repo pipeNullSplitZombie params repo = leaveZombie <$> pipeNullSplit params repo

View file

@ -132,7 +132,7 @@ parse s
-- --list output will have an = in the first line -- --list output will have an = in the first line
| all ('=' `elem`) (take 1 ls) = sep '=' ls | all ('=' `elem`) (take 1 ls) = sep '=' ls
-- --null --list output separates keys from values with newlines -- --null --list output separates keys from values with newlines
| otherwise = sep '\n' $ split "\0" s | otherwise = sep '\n' $ splitc '\0' s
where where
ls = lines s ls = lines s
sep c = M.fromListWith (++) . map (\(k,v) -> (k, [v])) . sep c = M.fromListWith (++) . map (\(k,v) -> (k, [v])) .

View file

@ -26,7 +26,7 @@ module Git.Construct (
#ifndef mingw32_HOST_OS #ifndef mingw32_HOST_OS
import System.Posix.User import System.Posix.User
#endif #endif
import qualified Data.Map as M hiding (map, split) import qualified Data.Map as M
import Network.URI import Network.URI
import Common import Common
@ -143,7 +143,7 @@ remoteNamedFromKey :: String -> IO Repo -> IO Repo
remoteNamedFromKey k = remoteNamed basename remoteNamedFromKey k = remoteNamed basename
where where
basename = intercalate "." $ basename = intercalate "." $
reverse $ drop 1 $ reverse $ drop 1 $ split "." k reverse $ drop 1 $ reverse $ drop 1 $ splitc '.' k
{- Constructs a new Repo for one of a Repo's remotes using a given {- Constructs a new Repo for one of a Repo's remotes using a given
- location (ie, an url). -} - location (ie, an url). -}

View file

@ -144,6 +144,6 @@ legal allowonelevel s = all (== False) illegal
ends v = v `isSuffixOf` s ends v = v `isSuffixOf` s
begins v = v `isPrefixOf` s begins v = v `isPrefixOf` s
pathbits = split "/" s pathbits = splitc '/' s
illegalchars = " ~^:?*[\\" ++ controlchars illegalchars = " ~^:?*[\\" ++ controlchars
controlchars = chr 0o177 : [chr 0 .. chr (0o40-1)] controlchars = chr 0o177 : [chr 0 .. chr (0o40-1)]

View file

@ -161,7 +161,7 @@ addCopies :: String -> Annex ()
addCopies = addLimit . limitCopies addCopies = addLimit . limitCopies
limitCopies :: MkLimit Annex limitCopies :: MkLimit Annex
limitCopies want = case split ":" want of limitCopies want = case splitc ':' want of
[v, n] -> case parsetrustspec v of [v, n] -> case parsetrustspec v of
Just checker -> go n $ checktrust checker Just checker -> go n $ checktrust checker
Nothing -> go n $ checkgroup v Nothing -> go n $ checkgroup v

View file

@ -268,7 +268,7 @@ readTransferInfo mpid s = TransferInfo
filename filename
| end rest == "\n" = beginning rest | end rest == "\n" = beginning rest
| otherwise = rest | otherwise = rest
bits = split " " firstline bits = splitc ' ' firstline
numbits = length bits numbits = length bits
time = if numbits > 0 time = if numbits > 0
then Just <$> parsePOSIXTime =<< headMaybe bits then Just <$> parsePOSIXTime =<< headMaybe bits

View file

@ -66,7 +66,7 @@ fixBadUUID = M.fromList . map fixup . M.toList
newertime (LogEntry (Date d) _) = d + minimumPOSIXTimeSlice newertime (LogEntry (Date d) _) = d + minimumPOSIXTimeSlice
newertime (LogEntry Unknown _) = minimumPOSIXTimeSlice newertime (LogEntry Unknown _) = minimumPOSIXTimeSlice
minimumPOSIXTimeSlice = 0.000001 minimumPOSIXTimeSlice = 0.000001
isuuid s = length s == 36 && length (split "-" s) == 5 isuuid s = length s == 36 && length (splitc '-' s) == 5
{- Records the uuid in the log, if it's not already there. -} {- Records the uuid in the log, if it's not already there. -}
recordUUID :: UUID -> Annex () recordUUID :: UUID -> Annex ()

View file

@ -140,7 +140,7 @@ byName' n = go . filter matching <$> remoteList
byNameOrGroup :: RemoteName -> Annex [Remote] byNameOrGroup :: RemoteName -> Annex [Remote]
byNameOrGroup n = go =<< getConfigMaybe (ConfigKey ("remotes." ++ n)) byNameOrGroup n = go =<< getConfigMaybe (ConfigKey ("remotes." ++ n))
where where
go (Just l) = catMaybes <$> mapM (byName . Just) (split " " l) go (Just l) = catMaybes <$> mapM (byName . Just) (splitc ' ' l)
go Nothing = maybeToList <$> byName (Just n) go Nothing = maybeToList <$> byName (Just n)
{- Only matches remote name, not UUID -} {- Only matches remote name, not UUID -}

View file

@ -302,7 +302,7 @@ ariaProgress (Just sz) meter ps = do
=<< ariaParams ps =<< ariaParams ps
parseAriaProgress :: Integer -> ProgressParser parseAriaProgress :: Integer -> ProgressParser
parseAriaProgress totalsize = go [] . reverse . split ['\r'] parseAriaProgress totalsize = go [] . reverse . splitc '\r'
where where
go remainder [] = (Nothing, remainder) go remainder [] = (Nothing, remainder)
go remainder (x:xs) = case readish (findpercent x) of go remainder (x:xs) = case readish (findpercent x) of

View file

@ -254,7 +254,7 @@ bup2GitRemote r
else giveup "please specify an absolute path" else giveup "please specify an absolute path"
| otherwise = Git.Construct.fromUrl $ "ssh://" ++ host ++ slash dir | otherwise = Git.Construct.fromUrl $ "ssh://" ++ host ++ slash dir
where where
bits = split ":" r bits = splitc ':' r
host = Prelude.head bits host = Prelude.head bits
dir = intercalate ":" $ drop 1 bits dir = intercalate ":" $ drop 1 bits
-- "host:~user/dir" is not supported specially by bup; -- "host:~user/dir" is not supported specially by bup;

View file

@ -165,7 +165,7 @@ extractCipher c = case (M.lookup "cipher" c,
Just $ SharedCipher (fromB64bs t) Just $ SharedCipher (fromB64bs t)
_ -> Nothing _ -> Nothing
where where
readkeys = KeyIds . split "," readkeys = KeyIds . splitc ','
describeEncryption :: RemoteConfig -> String describeEncryption :: RemoteConfig -> String
describeEncryption c = case extractCipher c of describeEncryption c = case extractCipher c of

View file

@ -25,7 +25,7 @@ allRefSpec :: RefSpec
allRefSpec = [AddMatching $ compileGlob "*" CaseSensative] allRefSpec = [AddMatching $ compileGlob "*" CaseSensative]
parseRefSpec :: String -> Either String RefSpec parseRefSpec :: String -> Either String RefSpec
parseRefSpec v = case partitionEithers (map mk $ split ":" v) of parseRefSpec v = case partitionEithers (map mk $ splitc ':' v) of
([],refspec) -> Right refspec ([],refspec) -> Right refspec
(e:_,_) -> Left e (e:_,_) -> Left e
where where

View file

@ -148,7 +148,7 @@ readKey1 v
, keyMtime = t , keyMtime = t
} }
where where
bits = split ":" v bits = splitc ':' v
b = Prelude.head bits b = Prelude.head bits
n = intercalate ":" $ drop (if wormy then 3 else 1) bits n = intercalate ":" $ drop (if wormy then 3 else 1) bits
t = if wormy t = if wormy

View file

@ -25,7 +25,7 @@ instance Show DottedVersion where
normalize :: String -> DottedVersion normalize :: String -> DottedVersion
normalize v = DottedVersion v $ normalize v = DottedVersion v $
sum $ mult 1 $ reverse $ extend precision $ take precision $ sum $ mult 1 $ reverse $ extend precision $ take precision $
map readi $ split "." v map readi $ splitc '.' v
where where
extend n l = l ++ replicate (n - length l) 0 extend n l = l ++ replicate (n - length l) 0
mult _ [] = [] mult _ [] = []

View file

@ -162,7 +162,7 @@ findPubKeys :: GpgCmd -> String -> IO KeyIds
findPubKeys cmd for = KeyIds . parse . lines <$> readStrict cmd params findPubKeys cmd for = KeyIds . parse . lines <$> readStrict cmd params
where where
params = [Param "--with-colons", Param "--list-public-keys", Param for] params = [Param "--with-colons", Param "--list-public-keys", Param for]
parse = mapMaybe (keyIdField . split ":") parse = mapMaybe (keyIdField . splitc ':')
keyIdField ("pub":_:_:_:f:_) = Just f keyIdField ("pub":_:_:_:f:_) = Just f
keyIdField _ = Nothing keyIdField _ = Nothing
@ -175,7 +175,7 @@ secretKeys cmd = catchDefaultIO M.empty makemap
where where
makemap = M.fromList . parse . lines <$> readStrict cmd params makemap = M.fromList . parse . lines <$> readStrict cmd params
params = [Param "--with-colons", Param "--list-secret-keys", Param "--fixed-list-mode"] params = [Param "--with-colons", Param "--list-secret-keys", Param "--fixed-list-mode"]
parse = extract [] Nothing . map (split ":") parse = extract [] Nothing . map (splitc ':')
extract c (Just keyid) (("uid":_:_:_:_:_:_:_:_:userid:_):rest) = extract c (Just keyid) (("uid":_:_:_:_:_:_:_:_:userid:_):rest) =
extract ((keyid, decode_c userid):c) Nothing rest extract ((keyid, decode_c userid):c) Nothing rest
extract c (Just keyid) rest@(("sec":_):_) = extract c (Just keyid) rest@(("sec":_):_) =

View file

@ -107,7 +107,7 @@ parseFormatted s = bundle $ go [] $ lines s
parsemode ('u':_) = OpenReadWrite parsemode ('u':_) = OpenReadWrite
parsemode _ = OpenUnknown parsemode _ = OpenUnknown
splitnull = split "\0" splitnull = splitc '\0'
parsefail = error $ "failed to parse lsof output: " ++ show s parsefail = error $ "failed to parse lsof output: " ++ show s

View file

@ -45,6 +45,14 @@ separate c l = unbreak $ break c l
| null b = r | null b = r
| otherwise = (a, tail b) | otherwise = (a, tail b)
{- Split on a single character. This is over twice as fast as using
- Data.List.Utils.split on a list of length 1, while producing
- identical results. -}
splitc :: Char -> String -> [String]
splitc c s = case break (== c) s of
(i, _c:rest) -> i : splitc c rest
(i, []) -> i : []
{- Breaks out the first line. -} {- Breaks out the first line. -}
firstLine :: String -> String firstLine :: String -> String
firstLine = takeWhile (/= '\n') firstLine = takeWhile (/= '\n')

View file

@ -124,14 +124,14 @@ supported Quvi09 url = (firstlevel <&&> secondlevel)
Nothing -> return False Nothing -> return False
Just auth -> do Just auth -> do
let domain = map toLower $ uriRegName auth let domain = map toLower $ uriRegName auth
let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ split "." domain let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ splitc '.' domain
any (\h -> domain `isSuffixOf` h || basedomain `isSuffixOf` h) any (\h -> domain `isSuffixOf` h || basedomain `isSuffixOf` h)
. map (map toLower) <$> listdomains Quvi09 . map (map toLower) <$> listdomains Quvi09
secondlevel = snd <$> processTranscript "quvi" secondlevel = snd <$> processTranscript "quvi"
(toCommand [Param "dump", Param "-o", Param url]) Nothing (toCommand [Param "dump", Param "-o", Param url]) Nothing
listdomains :: QuviVersion -> IO [String] listdomains :: QuviVersion -> IO [String]
listdomains Quvi09 = concatMap (split ",") listdomains Quvi09 = concatMap (splitc ',')
. concatMap (drop 1 . words) . concatMap (drop 1 . words)
. filter ("domains: " `isPrefixOf`) . lines . filter ("domains: " `isPrefixOf`) . lines
<$> readQuvi (toCommand [Param "info", Param "-p", Param "domains"]) <$> readQuvi (toCommand [Param "info", Param "-p", Param "domains"])

View file

@ -24,7 +24,7 @@ rsyncShell command = [Param "-e", Param $ unwords $ map escape (toCommand comman
{- rsync requires some weird, non-shell like quoting in {- rsync requires some weird, non-shell like quoting in
- here. A doubled single quote inside the single quoted - here. A doubled single quote inside the single quoted
- string is a single quote. -} - string is a single quote. -}
escape s = "'" ++ intercalate "''" (split "'" s) ++ "'" escape s = "'" ++ intercalate "''" (splitc '\'' s) ++ "'"
{- Runs rsync in server mode to send a file. -} {- Runs rsync in server mode to send a file. -}
rsyncServerSend :: [CommandParam] -> FilePath -> IO Bool rsyncServerSend :: [CommandParam] -> FilePath -> IO Bool
@ -123,7 +123,7 @@ parseRsyncProgress = go [] . reverse . progresschunks
{- Find chunks that each start with delim. {- Find chunks that each start with delim.
- The first chunk doesn't start with it - The first chunk doesn't start with it
- (it's empty when delim is at the start of the string). -} - (it's empty when delim is at the start of the string). -}
progresschunks = drop 1 . split [delim] progresschunks = drop 1 . splitc delim
findbytesstart s = dropWhile isSpace s findbytesstart s = dropWhile isSpace s
parsebytes :: String -> Maybe Integer parsebytes :: String -> Maybe Integer

View file

@ -11,7 +11,7 @@ module Utility.SafeCommand where
import System.Exit import System.Exit
import Utility.Process import Utility.Process
import Data.String.Utils import Utility.Misc
import System.FilePath import System.FilePath
import Data.Char import Data.Char
import Data.List import Data.List
@ -86,7 +86,7 @@ shellEscape :: String -> String
shellEscape f = "'" ++ escaped ++ "'" shellEscape f = "'" ++ escaped ++ "'"
where where
-- replace ' with '"'"' -- replace ' with '"'"'
escaped = intercalate "'\"'\"'" $ split "'" f escaped = intercalate "'\"'\"'" $ splitc '\'' f
-- | Unescapes a set of shellEscaped words or filenames. -- | Unescapes a set of shellEscaped words or filenames.
shellUnEscape :: String -> [String] shellUnEscape :: String -> [String]