Some optimisations to string splitting code.

Turns out that Data.List.Utils.split is slow and makes a lot of allocations. Here's a much simpler single character splitter that behaves the same (even in wacky corner cases) while running in half the time and 75% the allocations. As well as being an optimisation, this helps move toward eliminating use of missingh. (Data.List.Split.splitOn is nearly as slow as Data.List.Utils.split and allocates even more.) I have not benchmarked the effect on git-annex, but would not be surprised to see some parsing of eg, large streams from git commands run twice as fast, and possibly in less memory. This commit was sponsored by Boyd Stephen Smith Jr. on Patreon.
2017-01-31 18:40:42 -04:00 · 2017-01-31 18:40:42 -04:00 · 9eb10caa27
commit 9eb10caa27
parent dbaea98836
30 changed files with 47 additions and 38 deletions
--- a/Utility/DottedVersion.hs
+++ b/Utility/DottedVersion.hs
@ -25,7 +25,7 @@ instance Show DottedVersion where
 normalize :: String -> DottedVersion
 normalize v = DottedVersion v $ 
 	sum $ mult 1 $ reverse $ extend precision $ take precision $
-		map readi $ split "." v
+		map readi $ splitc '.' v
  where
 	extend n l = l ++ replicate (n - length l) 0
 	mult _ [] = []
--- a/Utility/Gpg.hs
+++ b/Utility/Gpg.hs
@ -162,7 +162,7 @@ findPubKeys :: GpgCmd -> String -> IO KeyIds
 findPubKeys cmd for = KeyIds . parse . lines <$> readStrict cmd params
  where
 	params = [Param "--with-colons", Param "--list-public-keys", Param for]
-	parse = mapMaybe (keyIdField . split ":")
+	parse = mapMaybe (keyIdField . splitc ':')
 	keyIdField ("pub":_:_:_:f:_) = Just f
 	keyIdField _ = Nothing

@ -175,7 +175,7 @@ secretKeys cmd = catchDefaultIO M.empty makemap
  where
 	makemap = M.fromList . parse . lines <$> readStrict cmd params
 	params = [Param "--with-colons", Param "--list-secret-keys", Param "--fixed-list-mode"]
-	parse = extract [] Nothing . map (split ":")
+	parse = extract [] Nothing . map (splitc ':')
 	extract c (Just keyid) (("uid":_:_:_:_:_:_:_:_:userid:_):rest) =
 		extract ((keyid, decode_c userid):c) Nothing rest
 	extract c (Just keyid) rest@(("sec":_):_) =
--- a/Utility/Lsof.hs
+++ b/Utility/Lsof.hs
@ -107,7 +107,7 @@ parseFormatted s = bundle $ go [] $ lines s
 	parsemode ('u':_) = OpenReadWrite
 	parsemode _ = OpenUnknown

-	splitnull = split "\0"
+	splitnull = splitc '\0'

 	parsefail = error $ "failed to parse lsof output: " ++ show s

--- a/Utility/Misc.hs
+++ b/Utility/Misc.hs
@ -45,6 +45,14 @@ separate c l = unbreak $ break c l
 		| null b = r
 		| otherwise = (a, tail b)

+{- Split on a single character. This is over twice as fast as using
+ - Data.List.Utils.split on a list of length 1, while producing
+ - identical results. -}
+splitc :: Char -> String -> [String]
+splitc c s = case break (== c) s of
+	(i, _c:rest) -> i : splitc c rest
+	(i, []) -> i : []
+
 {- Breaks out the first line. -}
 firstLine :: String -> String
 firstLine = takeWhile (/= '\n')
--- a/Utility/Quvi.hs
+++ b/Utility/Quvi.hs
@ -124,14 +124,14 @@ supported Quvi09 url = (firstlevel <&&> secondlevel)
 		Nothing -> return False
 		Just auth -> do
 			let domain = map toLower $ uriRegName auth
-			let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ split "." domain
+			let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ splitc '.' domain
 			any (\h -> domain `isSuffixOf` h || basedomain `isSuffixOf` h) 
 				. map (map toLower) <$> listdomains Quvi09
 	secondlevel = snd <$> processTranscript "quvi"
 		(toCommand [Param "dump", Param "-o", Param url]) Nothing

 listdomains :: QuviVersion -> IO [String]
-listdomains Quvi09 = concatMap (split ",") 
+listdomains Quvi09 = concatMap (splitc ',') 
 	. concatMap (drop 1 . words) 
 	. filter ("domains: " `isPrefixOf`) . lines
 	<$> readQuvi (toCommand [Param "info", Param "-p", Param "domains"])
--- a/Utility/Rsync.hs
+++ b/Utility/Rsync.hs
@ -24,7 +24,7 @@ rsyncShell command = [Param "-e", Param $ unwords $ map escape (toCommand comman
 	{- rsync requires some weird, non-shell like quoting in
 	- here. A doubled single quote inside the single quoted
 	- string is a single quote. -}
-	escape s = "'" ++  intercalate "''" (split "'" s) ++ "'"
+	escape s = "'" ++  intercalate "''" (splitc '\'' s) ++ "'"

 {- Runs rsync in server mode to send a file. -}
 rsyncServerSend :: [CommandParam] -> FilePath -> IO Bool
@ -123,7 +123,7 @@ parseRsyncProgress = go [] . reverse . progresschunks
 	{- Find chunks that each start with delim.
 	 - The first chunk doesn't start with it
 	 - (it's empty when delim is at the start of the string). -}
-	progresschunks = drop 1 . split [delim]
+	progresschunks = drop 1 . splitc delim
 	findbytesstart s = dropWhile isSpace s

 	parsebytes :: String -> Maybe Integer
--- a/Utility/SafeCommand.hs
+++ b/Utility/SafeCommand.hs
@ -11,7 +11,7 @@ module Utility.SafeCommand where

 import System.Exit
 import Utility.Process
-import Data.String.Utils
+import Utility.Misc
 import System.FilePath
 import Data.Char
 import Data.List
@ -86,7 +86,7 @@ shellEscape :: String -> String
 shellEscape f = "'" ++ escaped ++ "'"
  where
 	-- replace ' with '"'"'
-	escaped = intercalate "'\"'\"'" $ split "'" f
+	escaped = intercalate "'\"'\"'" $ splitc '\'' f

 -- | Unescapes a set of shellEscaped words or filenames.
 shellUnEscape :: String -> [String]