Some optimisations to string splitting code.

Turns out that Data.List.Utils.split is slow and makes a lot of
allocations. Here's a much simpler single character splitter that behaves
the same (even in wacky corner cases) while running in half the time and
75% the allocations.

As well as being an optimisation, this helps move toward eliminating use of
missingh.

(Data.List.Split.splitOn is nearly as slow as Data.List.Utils.split and
allocates even more.)

I have not benchmarked the effect on git-annex, but would not be surprised
to see some parsing of eg, large streams from git commands run twice as
fast, and possibly in less memory.

This commit was sponsored by Boyd Stephen Smith Jr. on Patreon.
This commit is contained in:
Joey Hess 2017-01-31 18:40:42 -04:00
parent dbaea98836
commit 9eb10caa27
No known key found for this signature in database
GPG key ID: C910D9222512E3C7
30 changed files with 47 additions and 38 deletions

View file

@ -441,7 +441,7 @@ setDirect wantdirect = do
- this way things that show HEAD (eg shell prompts) will
- hopefully show just "master". -}
directBranch :: Ref -> Ref
directBranch orighead = case split "/" $ fromRef orighead of
directBranch orighead = case splitc '/' $ fromRef orighead of
("refs":"heads":"annex":"direct":_) -> orighead
("refs":"heads":rest) ->
Ref $ "refs/heads/annex/direct/" ++ intercalate "/" rest
@ -452,7 +452,7 @@ directBranch orighead = case split "/" $ fromRef orighead of
- Any other ref is left unchanged.
-}
fromDirectBranch :: Ref -> Ref
fromDirectBranch directhead = case split "/" $ fromRef directhead of
fromDirectBranch directhead = case splitc '/' $ fromRef directhead of
("refs":"heads":"annex":"direct":rest) ->
Ref $ "refs/heads/" ++ intercalate "/" rest
_ -> directhead

View file

@ -39,7 +39,7 @@ toTaggedBranch u info b = Git.Ref $ intercalate "/" $ catMaybes
]
fromTaggedBranch :: Git.Branch -> Maybe (UUID, Maybe String)
fromTaggedBranch b = case split "/" $ Git.fromRef b of
fromTaggedBranch b = case splitc '/' $ Git.fromRef b of
("refs":"synced":u:info:_base) ->
Just (toUUID u, fromB64Maybe info)
("refs":"synced":u:_base) ->

View file

@ -383,7 +383,7 @@ mangleSshHostName sshdata = intercalate "-"
{- Extracts the real hostname from a mangled ssh hostname. -}
unMangleSshHostName :: String -> String
unMangleSshHostName h = case split "-" h of
unMangleSshHostName h = case splitc '-' h of
("git":"annex":rest) -> unescape (intercalate "-" (beginning rest))
_ -> h
where

View file

@ -103,7 +103,7 @@ selectExtension f
es = filter (not . null) $ reverse $
take 2 $ map (filter validInExtension) $
takeWhile shortenough $
reverse $ split "." $ takeExtensions f
reverse $ splitc '.' $ takeExtensions f
shortenough e = length e <= 4 -- long enough for "jpeg"
{- A key's checksum is checked during fsck. -}

View file

@ -95,7 +95,7 @@ findLibPath l = go =<< getEnv "DYLD_LIBRARY_PATH"
where
go Nothing = return l
go (Just p) = fromMaybe l
<$> firstM doesFileExist (map (</> f) (split ":" p))
<$> firstM doesFileExist (map (</> f) (splitc ':' p))
f = takeFileName l
{- Expands any @rpath in the list of libraries.

View file

@ -8,6 +8,7 @@ git-annex (6.20170102) UNRELEASED; urgency=medium
* vicfg: Include the numcopies configuation.
* config: New command for storing configuration in the git-annex branch.
* stack.yaml: Update to lts-7.18.
* Some optimisations to string splitting code.
-- Joey Hess <id@joeyh.name> Fri, 06 Jan 2017 15:22:06 -0400

View file

@ -387,7 +387,7 @@ url2file url pathdepth pathmax = case pathdepth of
]
frombits a = intercalate "/" $ a urlbits
urlbits = map (truncateFilePath pathmax . sanitizeFilePath) $
filter (not . null) $ split "/" fullurl
filter (not . null) $ splitc '/' fullurl
urlString2file :: URLString -> Maybe Int -> Int -> FilePath
urlString2file s pathdepth pathmax = case Url.parseURIRelaxed s of

View file

@ -92,7 +92,7 @@ hostname r
| otherwise = "localhost"
basehostname :: Git.Repo -> String
basehostname r = fromMaybe "" $ headMaybe $ split "." $ hostname r
basehostname r = fromMaybe "" $ headMaybe $ splitc '.' $ hostname r
{- A name to display for a repo. Uses the name from uuid.log if available,
- or the remote name if not. -}

View file

@ -269,7 +269,7 @@ withKeysReferencedDiff a getdiff extractsha = do
forM_ ds go
liftIO $ void clean
where
go d = do
go d = do
let sha = extractsha d
unless (sha == nullSha) $
(parseLinkOrPointer <$> catObject sha)

View file

@ -231,8 +231,8 @@ instance LensGpgEncParams (RemoteConfig, RemoteGitConfig) where
{- When the remote is configured to use public-key encryption,
- look up the recipient keys and add them to the option list. -}
case M.lookup "encryption" c of
Just "pubkey" -> Gpg.pkEncTo $ maybe [] (split ",") $ M.lookup "cipherkeys" c
Just "sharedpubkey" -> Gpg.pkEncTo $ maybe [] (split ",") $ M.lookup "pubkeys" c
Just "pubkey" -> Gpg.pkEncTo $ maybe [] (splitc ',') $ M.lookup "cipherkeys" c
Just "sharedpubkey" -> Gpg.pkEncTo $ maybe [] (splitc ',') $ M.lookup "pubkeys" c
_ -> []
getGpgDecParams (_c,gc) = map Param (remoteAnnexGnupgDecryptOptions gc)

View file

@ -91,16 +91,16 @@ pipeWrite params repo = withHandle StdinHandle createProcessSuccess $
pipeNullSplit :: [CommandParam] -> Repo -> IO ([String], IO Bool)
pipeNullSplit params repo = do
(s, cleanup) <- pipeReadLazy params repo
return (filter (not . null) $ split sep s, cleanup)
return (filter (not . null) $ splitc sep s, cleanup)
where
sep = "\0"
sep = '\0'
pipeNullSplitStrict :: [CommandParam] -> Repo -> IO [String]
pipeNullSplitStrict params repo = do
s <- pipeReadStrict params repo
return $ filter (not . null) $ split sep s
return $ filter (not . null) $ splitc sep s
where
sep = "\0"
sep = '\0'
pipeNullSplitZombie :: [CommandParam] -> Repo -> IO [String]
pipeNullSplitZombie params repo = leaveZombie <$> pipeNullSplit params repo

View file

@ -132,7 +132,7 @@ parse s
-- --list output will have an = in the first line
| all ('=' `elem`) (take 1 ls) = sep '=' ls
-- --null --list output separates keys from values with newlines
| otherwise = sep '\n' $ split "\0" s
| otherwise = sep '\n' $ splitc '\0' s
where
ls = lines s
sep c = M.fromListWith (++) . map (\(k,v) -> (k, [v])) .

View file

@ -26,7 +26,7 @@ module Git.Construct (
#ifndef mingw32_HOST_OS
import System.Posix.User
#endif
import qualified Data.Map as M hiding (map, split)
import qualified Data.Map as M
import Network.URI
import Common
@ -143,7 +143,7 @@ remoteNamedFromKey :: String -> IO Repo -> IO Repo
remoteNamedFromKey k = remoteNamed basename
where
basename = intercalate "." $
reverse $ drop 1 $ reverse $ drop 1 $ split "." k
reverse $ drop 1 $ reverse $ drop 1 $ splitc '.' k
{- Constructs a new Repo for one of a Repo's remotes using a given
- location (ie, an url). -}

View file

@ -144,6 +144,6 @@ legal allowonelevel s = all (== False) illegal
ends v = v `isSuffixOf` s
begins v = v `isPrefixOf` s
pathbits = split "/" s
pathbits = splitc '/' s
illegalchars = " ~^:?*[\\" ++ controlchars
controlchars = chr 0o177 : [chr 0 .. chr (0o40-1)]

View file

@ -161,7 +161,7 @@ addCopies :: String -> Annex ()
addCopies = addLimit . limitCopies
limitCopies :: MkLimit Annex
limitCopies want = case split ":" want of
limitCopies want = case splitc ':' want of
[v, n] -> case parsetrustspec v of
Just checker -> go n $ checktrust checker
Nothing -> go n $ checkgroup v

View file

@ -268,7 +268,7 @@ readTransferInfo mpid s = TransferInfo
filename
| end rest == "\n" = beginning rest
| otherwise = rest
bits = split " " firstline
bits = splitc ' ' firstline
numbits = length bits
time = if numbits > 0
then Just <$> parsePOSIXTime =<< headMaybe bits

View file

@ -66,7 +66,7 @@ fixBadUUID = M.fromList . map fixup . M.toList
newertime (LogEntry (Date d) _) = d + minimumPOSIXTimeSlice
newertime (LogEntry Unknown _) = minimumPOSIXTimeSlice
minimumPOSIXTimeSlice = 0.000001
isuuid s = length s == 36 && length (split "-" s) == 5
isuuid s = length s == 36 && length (splitc '-' s) == 5
{- Records the uuid in the log, if it's not already there. -}
recordUUID :: UUID -> Annex ()

View file

@ -140,7 +140,7 @@ byName' n = go . filter matching <$> remoteList
byNameOrGroup :: RemoteName -> Annex [Remote]
byNameOrGroup n = go =<< getConfigMaybe (ConfigKey ("remotes." ++ n))
where
go (Just l) = catMaybes <$> mapM (byName . Just) (split " " l)
go (Just l) = catMaybes <$> mapM (byName . Just) (splitc ' ' l)
go Nothing = maybeToList <$> byName (Just n)
{- Only matches remote name, not UUID -}

View file

@ -302,7 +302,7 @@ ariaProgress (Just sz) meter ps = do
=<< ariaParams ps
parseAriaProgress :: Integer -> ProgressParser
parseAriaProgress totalsize = go [] . reverse . split ['\r']
parseAriaProgress totalsize = go [] . reverse . splitc '\r'
where
go remainder [] = (Nothing, remainder)
go remainder (x:xs) = case readish (findpercent x) of

View file

@ -254,7 +254,7 @@ bup2GitRemote r
else giveup "please specify an absolute path"
| otherwise = Git.Construct.fromUrl $ "ssh://" ++ host ++ slash dir
where
bits = split ":" r
bits = splitc ':' r
host = Prelude.head bits
dir = intercalate ":" $ drop 1 bits
-- "host:~user/dir" is not supported specially by bup;

View file

@ -165,7 +165,7 @@ extractCipher c = case (M.lookup "cipher" c,
Just $ SharedCipher (fromB64bs t)
_ -> Nothing
where
readkeys = KeyIds . split ","
readkeys = KeyIds . splitc ','
describeEncryption :: RemoteConfig -> String
describeEncryption c = case extractCipher c of

View file

@ -25,7 +25,7 @@ allRefSpec :: RefSpec
allRefSpec = [AddMatching $ compileGlob "*" CaseSensative]
parseRefSpec :: String -> Either String RefSpec
parseRefSpec v = case partitionEithers (map mk $ split ":" v) of
parseRefSpec v = case partitionEithers (map mk $ splitc ':' v) of
([],refspec) -> Right refspec
(e:_,_) -> Left e
where

View file

@ -148,7 +148,7 @@ readKey1 v
, keyMtime = t
}
where
bits = split ":" v
bits = splitc ':' v
b = Prelude.head bits
n = intercalate ":" $ drop (if wormy then 3 else 1) bits
t = if wormy

View file

@ -25,7 +25,7 @@ instance Show DottedVersion where
normalize :: String -> DottedVersion
normalize v = DottedVersion v $
sum $ mult 1 $ reverse $ extend precision $ take precision $
map readi $ split "." v
map readi $ splitc '.' v
where
extend n l = l ++ replicate (n - length l) 0
mult _ [] = []

View file

@ -162,7 +162,7 @@ findPubKeys :: GpgCmd -> String -> IO KeyIds
findPubKeys cmd for = KeyIds . parse . lines <$> readStrict cmd params
where
params = [Param "--with-colons", Param "--list-public-keys", Param for]
parse = mapMaybe (keyIdField . split ":")
parse = mapMaybe (keyIdField . splitc ':')
keyIdField ("pub":_:_:_:f:_) = Just f
keyIdField _ = Nothing
@ -175,7 +175,7 @@ secretKeys cmd = catchDefaultIO M.empty makemap
where
makemap = M.fromList . parse . lines <$> readStrict cmd params
params = [Param "--with-colons", Param "--list-secret-keys", Param "--fixed-list-mode"]
parse = extract [] Nothing . map (split ":")
parse = extract [] Nothing . map (splitc ':')
extract c (Just keyid) (("uid":_:_:_:_:_:_:_:_:userid:_):rest) =
extract ((keyid, decode_c userid):c) Nothing rest
extract c (Just keyid) rest@(("sec":_):_) =

View file

@ -107,7 +107,7 @@ parseFormatted s = bundle $ go [] $ lines s
parsemode ('u':_) = OpenReadWrite
parsemode _ = OpenUnknown
splitnull = split "\0"
splitnull = splitc '\0'
parsefail = error $ "failed to parse lsof output: " ++ show s

View file

@ -45,6 +45,14 @@ separate c l = unbreak $ break c l
| null b = r
| otherwise = (a, tail b)
{- Split on a single character. This is over twice as fast as using
- Data.List.Utils.split on a list of length 1, while producing
- identical results. -}
splitc :: Char -> String -> [String]
splitc c s = case break (== c) s of
(i, _c:rest) -> i : splitc c rest
(i, []) -> i : []
{- Breaks out the first line. -}
firstLine :: String -> String
firstLine = takeWhile (/= '\n')

View file

@ -124,14 +124,14 @@ supported Quvi09 url = (firstlevel <&&> secondlevel)
Nothing -> return False
Just auth -> do
let domain = map toLower $ uriRegName auth
let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ split "." domain
let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ splitc '.' domain
any (\h -> domain `isSuffixOf` h || basedomain `isSuffixOf` h)
. map (map toLower) <$> listdomains Quvi09
secondlevel = snd <$> processTranscript "quvi"
(toCommand [Param "dump", Param "-o", Param url]) Nothing
listdomains :: QuviVersion -> IO [String]
listdomains Quvi09 = concatMap (split ",")
listdomains Quvi09 = concatMap (splitc ',')
. concatMap (drop 1 . words)
. filter ("domains: " `isPrefixOf`) . lines
<$> readQuvi (toCommand [Param "info", Param "-p", Param "domains"])

View file

@ -24,7 +24,7 @@ rsyncShell command = [Param "-e", Param $ unwords $ map escape (toCommand comman
{- rsync requires some weird, non-shell like quoting in
- here. A doubled single quote inside the single quoted
- string is a single quote. -}
escape s = "'" ++ intercalate "''" (split "'" s) ++ "'"
escape s = "'" ++ intercalate "''" (splitc '\'' s) ++ "'"
{- Runs rsync in server mode to send a file. -}
rsyncServerSend :: [CommandParam] -> FilePath -> IO Bool
@ -123,7 +123,7 @@ parseRsyncProgress = go [] . reverse . progresschunks
{- Find chunks that each start with delim.
- The first chunk doesn't start with it
- (it's empty when delim is at the start of the string). -}
progresschunks = drop 1 . split [delim]
progresschunks = drop 1 . splitc delim
findbytesstart s = dropWhile isSpace s
parsebytes :: String -> Maybe Integer

View file

@ -11,7 +11,7 @@ module Utility.SafeCommand where
import System.Exit
import Utility.Process
import Data.String.Utils
import Utility.Misc
import System.FilePath
import Data.Char
import Data.List
@ -86,7 +86,7 @@ shellEscape :: String -> String
shellEscape f = "'" ++ escaped ++ "'"
where
-- replace ' with '"'"'
escaped = intercalate "'\"'\"'" $ split "'" f
escaped = intercalate "'\"'\"'" $ splitc '\'' f
-- | Unescapes a set of shellEscaped words or filenames.
shellUnEscape :: String -> [String]