From 9eb10caa27e577c854e4cd74db5c7a4e9fed2c92 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 31 Jan 2017 18:40:42 -0400 Subject: [PATCH] Some optimisations to string splitting code. Turns out that Data.List.Utils.split is slow and makes a lot of allocations. Here's a much simpler single character splitter that behaves the same (even in wacky corner cases) while running in half the time and 75% the allocations. As well as being an optimisation, this helps move toward eliminating use of missingh. (Data.List.Split.splitOn is nearly as slow as Data.List.Utils.split and allocates even more.) I have not benchmarked the effect on git-annex, but would not be surprised to see some parsing of eg, large streams from git commands run twice as fast, and possibly in less memory. This commit was sponsored by Boyd Stephen Smith Jr. on Patreon. --- Annex/Direct.hs | 4 ++-- Annex/TaggedPush.hs | 2 +- Assistant/Ssh.hs | 2 +- Backend/Hash.hs | 2 +- Build/OSXMkLibs.hs | 2 +- CHANGELOG | 1 + Command/AddUrl.hs | 2 +- Command/Map.hs | 2 +- Command/Unused.hs | 2 +- Crypto.hs | 4 ++-- Git/Command.hs | 8 ++++---- Git/Config.hs | 2 +- Git/Construct.hs | 4 ++-- Git/Ref.hs | 2 +- Limit.hs | 2 +- Logs/Transfer.hs | 2 +- Logs/UUID.hs | 2 +- Remote.hs | 2 +- Remote/BitTorrent.hs | 2 +- Remote/Bup.hs | 2 +- Remote/Helper/Encryptable.hs | 2 +- Types/RefSpec.hs | 2 +- Upgrade/V1.hs | 2 +- Utility/DottedVersion.hs | 2 +- Utility/Gpg.hs | 4 ++-- Utility/Lsof.hs | 2 +- Utility/Misc.hs | 8 ++++++++ Utility/Quvi.hs | 4 ++-- Utility/Rsync.hs | 4 ++-- Utility/SafeCommand.hs | 4 ++-- 30 files changed, 47 insertions(+), 38 deletions(-) diff --git a/Annex/Direct.hs b/Annex/Direct.hs index 5724d11621..e5c1c47c82 100644 --- a/Annex/Direct.hs +++ b/Annex/Direct.hs @@ -441,7 +441,7 @@ setDirect wantdirect = do - this way things that show HEAD (eg shell prompts) will - hopefully show just "master". -} directBranch :: Ref -> Ref -directBranch orighead = case split "/" $ fromRef orighead of +directBranch orighead = case splitc '/' $ fromRef orighead of ("refs":"heads":"annex":"direct":_) -> orighead ("refs":"heads":rest) -> Ref $ "refs/heads/annex/direct/" ++ intercalate "/" rest @@ -452,7 +452,7 @@ directBranch orighead = case split "/" $ fromRef orighead of - Any other ref is left unchanged. -} fromDirectBranch :: Ref -> Ref -fromDirectBranch directhead = case split "/" $ fromRef directhead of +fromDirectBranch directhead = case splitc '/' $ fromRef directhead of ("refs":"heads":"annex":"direct":rest) -> Ref $ "refs/heads/" ++ intercalate "/" rest _ -> directhead diff --git a/Annex/TaggedPush.hs b/Annex/TaggedPush.hs index ef1aeeea2d..3b06170b31 100644 --- a/Annex/TaggedPush.hs +++ b/Annex/TaggedPush.hs @@ -39,7 +39,7 @@ toTaggedBranch u info b = Git.Ref $ intercalate "/" $ catMaybes ] fromTaggedBranch :: Git.Branch -> Maybe (UUID, Maybe String) -fromTaggedBranch b = case split "/" $ Git.fromRef b of +fromTaggedBranch b = case splitc '/' $ Git.fromRef b of ("refs":"synced":u:info:_base) -> Just (toUUID u, fromB64Maybe info) ("refs":"synced":u:_base) -> diff --git a/Assistant/Ssh.hs b/Assistant/Ssh.hs index 66ed542570..e439ecd231 100644 --- a/Assistant/Ssh.hs +++ b/Assistant/Ssh.hs @@ -383,7 +383,7 @@ mangleSshHostName sshdata = intercalate "-" {- Extracts the real hostname from a mangled ssh hostname. -} unMangleSshHostName :: String -> String -unMangleSshHostName h = case split "-" h of +unMangleSshHostName h = case splitc '-' h of ("git":"annex":rest) -> unescape (intercalate "-" (beginning rest)) _ -> h where diff --git a/Backend/Hash.hs b/Backend/Hash.hs index ba8d4bc641..c85047d517 100644 --- a/Backend/Hash.hs +++ b/Backend/Hash.hs @@ -103,7 +103,7 @@ selectExtension f es = filter (not . null) $ reverse $ take 2 $ map (filter validInExtension) $ takeWhile shortenough $ - reverse $ split "." $ takeExtensions f + reverse $ splitc '.' $ takeExtensions f shortenough e = length e <= 4 -- long enough for "jpeg" {- A key's checksum is checked during fsck. -} diff --git a/Build/OSXMkLibs.hs b/Build/OSXMkLibs.hs index 948b0d5a35..2243c6b862 100644 --- a/Build/OSXMkLibs.hs +++ b/Build/OSXMkLibs.hs @@ -95,7 +95,7 @@ findLibPath l = go =<< getEnv "DYLD_LIBRARY_PATH" where go Nothing = return l go (Just p) = fromMaybe l - <$> firstM doesFileExist (map ( f) (split ":" p)) + <$> firstM doesFileExist (map ( f) (splitc ':' p)) f = takeFileName l {- Expands any @rpath in the list of libraries. diff --git a/CHANGELOG b/CHANGELOG index 30b7b147b8..634b22082f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,7 @@ git-annex (6.20170102) UNRELEASED; urgency=medium * vicfg: Include the numcopies configuation. * config: New command for storing configuration in the git-annex branch. * stack.yaml: Update to lts-7.18. + * Some optimisations to string splitting code. -- Joey Hess Fri, 06 Jan 2017 15:22:06 -0400 diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index 8cc1484406..169875f4bf 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -387,7 +387,7 @@ url2file url pathdepth pathmax = case pathdepth of ] frombits a = intercalate "/" $ a urlbits urlbits = map (truncateFilePath pathmax . sanitizeFilePath) $ - filter (not . null) $ split "/" fullurl + filter (not . null) $ splitc '/' fullurl urlString2file :: URLString -> Maybe Int -> Int -> FilePath urlString2file s pathdepth pathmax = case Url.parseURIRelaxed s of diff --git a/Command/Map.hs b/Command/Map.hs index 43c00d2572..b04beb477c 100644 --- a/Command/Map.hs +++ b/Command/Map.hs @@ -92,7 +92,7 @@ hostname r | otherwise = "localhost" basehostname :: Git.Repo -> String -basehostname r = fromMaybe "" $ headMaybe $ split "." $ hostname r +basehostname r = fromMaybe "" $ headMaybe $ splitc '.' $ hostname r {- A name to display for a repo. Uses the name from uuid.log if available, - or the remote name if not. -} diff --git a/Command/Unused.hs b/Command/Unused.hs index 1711fe047c..3953f44861 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -269,7 +269,7 @@ withKeysReferencedDiff a getdiff extractsha = do forM_ ds go liftIO $ void clean where - go d = do + go d = do let sha = extractsha d unless (sha == nullSha) $ (parseLinkOrPointer <$> catObject sha) diff --git a/Crypto.hs b/Crypto.hs index d3cbfa2f7f..dc1d2e6d24 100644 --- a/Crypto.hs +++ b/Crypto.hs @@ -231,8 +231,8 @@ instance LensGpgEncParams (RemoteConfig, RemoteGitConfig) where {- When the remote is configured to use public-key encryption, - look up the recipient keys and add them to the option list. -} case M.lookup "encryption" c of - Just "pubkey" -> Gpg.pkEncTo $ maybe [] (split ",") $ M.lookup "cipherkeys" c - Just "sharedpubkey" -> Gpg.pkEncTo $ maybe [] (split ",") $ M.lookup "pubkeys" c + Just "pubkey" -> Gpg.pkEncTo $ maybe [] (splitc ',') $ M.lookup "cipherkeys" c + Just "sharedpubkey" -> Gpg.pkEncTo $ maybe [] (splitc ',') $ M.lookup "pubkeys" c _ -> [] getGpgDecParams (_c,gc) = map Param (remoteAnnexGnupgDecryptOptions gc) diff --git a/Git/Command.hs b/Git/Command.hs index adea7622e0..f40dfabcd8 100644 --- a/Git/Command.hs +++ b/Git/Command.hs @@ -91,16 +91,16 @@ pipeWrite params repo = withHandle StdinHandle createProcessSuccess $ pipeNullSplit :: [CommandParam] -> Repo -> IO ([String], IO Bool) pipeNullSplit params repo = do (s, cleanup) <- pipeReadLazy params repo - return (filter (not . null) $ split sep s, cleanup) + return (filter (not . null) $ splitc sep s, cleanup) where - sep = "\0" + sep = '\0' pipeNullSplitStrict :: [CommandParam] -> Repo -> IO [String] pipeNullSplitStrict params repo = do s <- pipeReadStrict params repo - return $ filter (not . null) $ split sep s + return $ filter (not . null) $ splitc sep s where - sep = "\0" + sep = '\0' pipeNullSplitZombie :: [CommandParam] -> Repo -> IO [String] pipeNullSplitZombie params repo = leaveZombie <$> pipeNullSplit params repo diff --git a/Git/Config.hs b/Git/Config.hs index 65bd9b7ba3..9b4c342a4b 100644 --- a/Git/Config.hs +++ b/Git/Config.hs @@ -132,7 +132,7 @@ parse s -- --list output will have an = in the first line | all ('=' `elem`) (take 1 ls) = sep '=' ls -- --null --list output separates keys from values with newlines - | otherwise = sep '\n' $ split "\0" s + | otherwise = sep '\n' $ splitc '\0' s where ls = lines s sep c = M.fromListWith (++) . map (\(k,v) -> (k, [v])) . diff --git a/Git/Construct.hs b/Git/Construct.hs index 765562212e..4899278805 100644 --- a/Git/Construct.hs +++ b/Git/Construct.hs @@ -26,7 +26,7 @@ module Git.Construct ( #ifndef mingw32_HOST_OS import System.Posix.User #endif -import qualified Data.Map as M hiding (map, split) +import qualified Data.Map as M import Network.URI import Common @@ -143,7 +143,7 @@ remoteNamedFromKey :: String -> IO Repo -> IO Repo remoteNamedFromKey k = remoteNamed basename where basename = intercalate "." $ - reverse $ drop 1 $ reverse $ drop 1 $ split "." k + reverse $ drop 1 $ reverse $ drop 1 $ splitc '.' k {- Constructs a new Repo for one of a Repo's remotes using a given - location (ie, an url). -} diff --git a/Git/Ref.hs b/Git/Ref.hs index 5b3b85324c..2d80137387 100644 --- a/Git/Ref.hs +++ b/Git/Ref.hs @@ -144,6 +144,6 @@ legal allowonelevel s = all (== False) illegal ends v = v `isSuffixOf` s begins v = v `isPrefixOf` s - pathbits = split "/" s + pathbits = splitc '/' s illegalchars = " ~^:?*[\\" ++ controlchars controlchars = chr 0o177 : [chr 0 .. chr (0o40-1)] diff --git a/Limit.hs b/Limit.hs index efe4fea85b..7b26f9e588 100644 --- a/Limit.hs +++ b/Limit.hs @@ -161,7 +161,7 @@ addCopies :: String -> Annex () addCopies = addLimit . limitCopies limitCopies :: MkLimit Annex -limitCopies want = case split ":" want of +limitCopies want = case splitc ':' want of [v, n] -> case parsetrustspec v of Just checker -> go n $ checktrust checker Nothing -> go n $ checkgroup v diff --git a/Logs/Transfer.hs b/Logs/Transfer.hs index 28f7b0a263..903db96fe4 100644 --- a/Logs/Transfer.hs +++ b/Logs/Transfer.hs @@ -268,7 +268,7 @@ readTransferInfo mpid s = TransferInfo filename | end rest == "\n" = beginning rest | otherwise = rest - bits = split " " firstline + bits = splitc ' ' firstline numbits = length bits time = if numbits > 0 then Just <$> parsePOSIXTime =<< headMaybe bits diff --git a/Logs/UUID.hs b/Logs/UUID.hs index 60c8a2ef95..4c84d10bd8 100644 --- a/Logs/UUID.hs +++ b/Logs/UUID.hs @@ -66,7 +66,7 @@ fixBadUUID = M.fromList . map fixup . M.toList newertime (LogEntry (Date d) _) = d + minimumPOSIXTimeSlice newertime (LogEntry Unknown _) = minimumPOSIXTimeSlice minimumPOSIXTimeSlice = 0.000001 - isuuid s = length s == 36 && length (split "-" s) == 5 + isuuid s = length s == 36 && length (splitc '-' s) == 5 {- Records the uuid in the log, if it's not already there. -} recordUUID :: UUID -> Annex () diff --git a/Remote.hs b/Remote.hs index 9479e72d1b..8c774915a8 100644 --- a/Remote.hs +++ b/Remote.hs @@ -140,7 +140,7 @@ byName' n = go . filter matching <$> remoteList byNameOrGroup :: RemoteName -> Annex [Remote] byNameOrGroup n = go =<< getConfigMaybe (ConfigKey ("remotes." ++ n)) where - go (Just l) = catMaybes <$> mapM (byName . Just) (split " " l) + go (Just l) = catMaybes <$> mapM (byName . Just) (splitc ' ' l) go Nothing = maybeToList <$> byName (Just n) {- Only matches remote name, not UUID -} diff --git a/Remote/BitTorrent.hs b/Remote/BitTorrent.hs index 0ec78aa642..2f29f5baa4 100644 --- a/Remote/BitTorrent.hs +++ b/Remote/BitTorrent.hs @@ -302,7 +302,7 @@ ariaProgress (Just sz) meter ps = do =<< ariaParams ps parseAriaProgress :: Integer -> ProgressParser -parseAriaProgress totalsize = go [] . reverse . split ['\r'] +parseAriaProgress totalsize = go [] . reverse . splitc '\r' where go remainder [] = (Nothing, remainder) go remainder (x:xs) = case readish (findpercent x) of diff --git a/Remote/Bup.hs b/Remote/Bup.hs index 332e8d5dc6..75b3795587 100644 --- a/Remote/Bup.hs +++ b/Remote/Bup.hs @@ -254,7 +254,7 @@ bup2GitRemote r else giveup "please specify an absolute path" | otherwise = Git.Construct.fromUrl $ "ssh://" ++ host ++ slash dir where - bits = split ":" r + bits = splitc ':' r host = Prelude.head bits dir = intercalate ":" $ drop 1 bits -- "host:~user/dir" is not supported specially by bup; diff --git a/Remote/Helper/Encryptable.hs b/Remote/Helper/Encryptable.hs index 45ceae0681..029ac4b09c 100644 --- a/Remote/Helper/Encryptable.hs +++ b/Remote/Helper/Encryptable.hs @@ -165,7 +165,7 @@ extractCipher c = case (M.lookup "cipher" c, Just $ SharedCipher (fromB64bs t) _ -> Nothing where - readkeys = KeyIds . split "," + readkeys = KeyIds . splitc ',' describeEncryption :: RemoteConfig -> String describeEncryption c = case extractCipher c of diff --git a/Types/RefSpec.hs b/Types/RefSpec.hs index 091631abd5..c71e57d92e 100644 --- a/Types/RefSpec.hs +++ b/Types/RefSpec.hs @@ -25,7 +25,7 @@ allRefSpec :: RefSpec allRefSpec = [AddMatching $ compileGlob "*" CaseSensative] parseRefSpec :: String -> Either String RefSpec -parseRefSpec v = case partitionEithers (map mk $ split ":" v) of +parseRefSpec v = case partitionEithers (map mk $ splitc ':' v) of ([],refspec) -> Right refspec (e:_,_) -> Left e where diff --git a/Upgrade/V1.hs b/Upgrade/V1.hs index c82cf92f51..725bb40898 100644 --- a/Upgrade/V1.hs +++ b/Upgrade/V1.hs @@ -148,7 +148,7 @@ readKey1 v , keyMtime = t } where - bits = split ":" v + bits = splitc ':' v b = Prelude.head bits n = intercalate ":" $ drop (if wormy then 3 else 1) bits t = if wormy diff --git a/Utility/DottedVersion.hs b/Utility/DottedVersion.hs index ebf4c0bd1d..3198b1ce27 100644 --- a/Utility/DottedVersion.hs +++ b/Utility/DottedVersion.hs @@ -25,7 +25,7 @@ instance Show DottedVersion where normalize :: String -> DottedVersion normalize v = DottedVersion v $ sum $ mult 1 $ reverse $ extend precision $ take precision $ - map readi $ split "." v + map readi $ splitc '.' v where extend n l = l ++ replicate (n - length l) 0 mult _ [] = [] diff --git a/Utility/Gpg.hs b/Utility/Gpg.hs index 1185152220..a5d3820832 100644 --- a/Utility/Gpg.hs +++ b/Utility/Gpg.hs @@ -162,7 +162,7 @@ findPubKeys :: GpgCmd -> String -> IO KeyIds findPubKeys cmd for = KeyIds . parse . lines <$> readStrict cmd params where params = [Param "--with-colons", Param "--list-public-keys", Param for] - parse = mapMaybe (keyIdField . split ":") + parse = mapMaybe (keyIdField . splitc ':') keyIdField ("pub":_:_:_:f:_) = Just f keyIdField _ = Nothing @@ -175,7 +175,7 @@ secretKeys cmd = catchDefaultIO M.empty makemap where makemap = M.fromList . parse . lines <$> readStrict cmd params params = [Param "--with-colons", Param "--list-secret-keys", Param "--fixed-list-mode"] - parse = extract [] Nothing . map (split ":") + parse = extract [] Nothing . map (splitc ':') extract c (Just keyid) (("uid":_:_:_:_:_:_:_:_:userid:_):rest) = extract ((keyid, decode_c userid):c) Nothing rest extract c (Just keyid) rest@(("sec":_):_) = diff --git a/Utility/Lsof.hs b/Utility/Lsof.hs index 27d34b5925..e3ed709ec3 100644 --- a/Utility/Lsof.hs +++ b/Utility/Lsof.hs @@ -107,7 +107,7 @@ parseFormatted s = bundle $ go [] $ lines s parsemode ('u':_) = OpenReadWrite parsemode _ = OpenUnknown - splitnull = split "\0" + splitnull = splitc '\0' parsefail = error $ "failed to parse lsof output: " ++ show s diff --git a/Utility/Misc.hs b/Utility/Misc.hs index 4498c0a03e..564935ddb6 100644 --- a/Utility/Misc.hs +++ b/Utility/Misc.hs @@ -45,6 +45,14 @@ separate c l = unbreak $ break c l | null b = r | otherwise = (a, tail b) +{- Split on a single character. This is over twice as fast as using + - Data.List.Utils.split on a list of length 1, while producing + - identical results. -} +splitc :: Char -> String -> [String] +splitc c s = case break (== c) s of + (i, _c:rest) -> i : splitc c rest + (i, []) -> i : [] + {- Breaks out the first line. -} firstLine :: String -> String firstLine = takeWhile (/= '\n') diff --git a/Utility/Quvi.hs b/Utility/Quvi.hs index d33d79bb8e..ff1ad854c5 100644 --- a/Utility/Quvi.hs +++ b/Utility/Quvi.hs @@ -124,14 +124,14 @@ supported Quvi09 url = (firstlevel <&&> secondlevel) Nothing -> return False Just auth -> do let domain = map toLower $ uriRegName auth - let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ split "." domain + let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ splitc '.' domain any (\h -> domain `isSuffixOf` h || basedomain `isSuffixOf` h) . map (map toLower) <$> listdomains Quvi09 secondlevel = snd <$> processTranscript "quvi" (toCommand [Param "dump", Param "-o", Param url]) Nothing listdomains :: QuviVersion -> IO [String] -listdomains Quvi09 = concatMap (split ",") +listdomains Quvi09 = concatMap (splitc ',') . concatMap (drop 1 . words) . filter ("domains: " `isPrefixOf`) . lines <$> readQuvi (toCommand [Param "info", Param "-p", Param "domains"]) diff --git a/Utility/Rsync.hs b/Utility/Rsync.hs index d3fe98120f..d3823a5283 100644 --- a/Utility/Rsync.hs +++ b/Utility/Rsync.hs @@ -24,7 +24,7 @@ rsyncShell command = [Param "-e", Param $ unwords $ map escape (toCommand comman {- rsync requires some weird, non-shell like quoting in - here. A doubled single quote inside the single quoted - string is a single quote. -} - escape s = "'" ++ intercalate "''" (split "'" s) ++ "'" + escape s = "'" ++ intercalate "''" (splitc '\'' s) ++ "'" {- Runs rsync in server mode to send a file. -} rsyncServerSend :: [CommandParam] -> FilePath -> IO Bool @@ -123,7 +123,7 @@ parseRsyncProgress = go [] . reverse . progresschunks {- Find chunks that each start with delim. - The first chunk doesn't start with it - (it's empty when delim is at the start of the string). -} - progresschunks = drop 1 . split [delim] + progresschunks = drop 1 . splitc delim findbytesstart s = dropWhile isSpace s parsebytes :: String -> Maybe Integer diff --git a/Utility/SafeCommand.hs b/Utility/SafeCommand.hs index 5ce17a8453..bef0a619d3 100644 --- a/Utility/SafeCommand.hs +++ b/Utility/SafeCommand.hs @@ -11,7 +11,7 @@ module Utility.SafeCommand where import System.Exit import Utility.Process -import Data.String.Utils +import Utility.Misc import System.FilePath import Data.Char import Data.List @@ -86,7 +86,7 @@ shellEscape :: String -> String shellEscape f = "'" ++ escaped ++ "'" where -- replace ' with '"'"' - escaped = intercalate "'\"'\"'" $ split "'" f + escaped = intercalate "'\"'\"'" $ splitc '\'' f -- | Unescapes a set of shellEscaped words or filenames. shellUnEscape :: String -> [String]