optimise catfile interface with ByteString and Attoparsec

Around 3% total speedup.

Profiling git annex find --not --in web, it's now bytestring end-to-end,
and there is only a little added overhead in eg accessing the Annex
state MVar (3%). The rest of the runtime is spent reading symlinks, and
in attoparsec.

This feels like the end of the optimisation road, without a major change
like caching information for faster queries.
This commit is contained in:
Joey Hess 2020-04-10 14:03:40 -04:00
parent 2caf579718
commit 86426036a0
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 31 additions and 23 deletions

View file

@ -3,7 +3,7 @@ git-annex (8.20200331) UNRELEASED; urgency=medium
* Improve git-annex's ability to find the path to its program, * Improve git-annex's ability to find the path to its program,
especially when it needs to run itself in another repo to upgrade it. especially when it needs to run itself in another repo to upgrade it.
* adb: Better messages when the adb command is not installed. * adb: Better messages when the adb command is not installed.
* Sped up query commands that read the git-annex branch by around 6%. * Sped up query commands that read the git-annex branch by around 9%.
* Various speed improvements gained by using ByteStrings for git refs and * Various speed improvements gained by using ByteStrings for git refs and
shas. shas.

View file

@ -1,6 +1,6 @@
{- git cat-file interface {- git cat-file interface
- -
- Copyright 2011-2019 Joey Hess <id@joeyh.name> - Copyright 2011-2020 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -24,6 +24,9 @@ module Git.CatFile (
import System.IO import System.IO
import qualified Data.ByteString as S import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Char8 as S8
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.ByteString.Char8 as A8
import qualified Data.Map as M import qualified Data.Map as M
import Data.String import Data.String
import Data.Char import Data.Char
@ -83,9 +86,9 @@ catObject h object = maybe L.empty fst3 <$> catObjectDetails h object
catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, ObjectType)) catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, ObjectType))
catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \from -> do catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \from -> do
header <- hGetLine from header <- S8.hGetLine from
case parseResp object header of case parseResp object header of
Just (ParsedResp sha size objtype) -> do Just (ParsedResp sha objtype size) -> do
content <- S.hGet from (fromIntegral size) content <- S.hGet from (fromIntegral size)
eatchar '\n' from eatchar '\n' from
return $ Just (L.fromChunks [content], sha, objtype) return $ Just (L.fromChunks [content], sha, objtype)
@ -113,9 +116,9 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
{- Gets the size and type of an object, without reading its content. -} {- Gets the size and type of an object, without reading its content. -}
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType)) catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
resp <- hGetLine from resp <- S8.hGetLine from
case parseResp object resp of case parseResp object resp of
Just (ParsedResp sha size objtype) -> Just (ParsedResp sha objtype size) ->
return $ Just (sha, size, objtype) return $ Just (sha, size, objtype)
Just DNE -> return Nothing Just DNE -> return Nothing
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object) Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
@ -127,35 +130,39 @@ catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $
objtype <- queryObjectType object (gitRepo h) objtype <- queryObjectType object (gitRepo h)
return $ (,,) <$> sha <*> sz <*> objtype return $ (,,) <$> sha <*> sz <*> objtype
data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE data ParsedResp = ParsedResp Sha ObjectType FileSize | DNE
deriving (Show)
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
query hdl object newlinefallback receive query hdl object newlinefallback receive
-- git cat-file --batch uses a line based protocol, so when the -- git cat-file --batch uses a line based protocol, so when the
-- filename itself contains a newline, have to fall back to another -- filename itself contains a newline, have to fall back to another
-- method of getting the information. -- method of getting the information.
| '\n' `elem` s = newlinefallback | '\n' `S8.elem` s = newlinefallback
-- git strips carriage return from the end of a line, out of some -- git strips carriage return from the end of a line, out of some
-- misplaced desire to support windows, so also use the newline -- misplaced desire to support windows, so also use the newline
-- fallback for those. -- fallback for those.
| "\r" `isSuffixOf` s = newlinefallback | "\r" `S8.isSuffixOf` s = newlinefallback
| otherwise = CoProcess.query hdl send receive | otherwise = CoProcess.query hdl send receive
where where
send to = hPutStrLn to s send to = S8.hPutStrLn to s
s = fromRef object s = fromRef' object
parseResp :: Ref -> String -> Maybe ParsedResp parseResp :: Ref -> S.ByteString -> Maybe ParsedResp
parseResp object l parseResp object s
| " missing" `isSuffixOf` l -- less expensive than full check | " missing" `S.isSuffixOf` s -- less expensive than full check
&& l == fromRef object ++ " missing" = Just DNE && s == fromRef' object <> " missing" = Just DNE
| otherwise = case words l of | otherwise = eitherToMaybe $ A.parseOnly respParser s
[sha, objtype, size] -> case extractSha (encodeBS sha) of
Just sha' -> case (readObjectType (encodeBS objtype), reads size) of respParser :: A.Parser ParsedResp
(Just t, [(bytes, "")]) -> respParser = ParsedResp
Just $ ParsedResp sha' bytes t <$> (maybe (fail "bad sha") return . extractSha =<< nextword)
_ -> Nothing <* A8.char ' '
Nothing -> Nothing <*> (maybe (fail "bad object type") return . readObjectType =<< nextword)
_ -> Nothing <* A8.char ' '
<*> A8.decimal
where
nextword = A8.takeTill (== ' ')
querySingle :: CommandParam -> Ref -> Repo -> (Handle -> IO a) -> IO (Maybe a) querySingle :: CommandParam -> Ref -> Repo -> (Handle -> IO a) -> IO (Maybe a)
querySingle o r repo reader = assertLocal repo $ querySingle o r repo reader = assertLocal repo $

View file

@ -101,6 +101,7 @@ newtype RefDate = RefDate String
{- Types of objects that can be stored in git. -} {- Types of objects that can be stored in git. -}
data ObjectType = BlobObject | CommitObject | TreeObject data ObjectType = BlobObject | CommitObject | TreeObject
deriving (Show)
readObjectType :: S.ByteString -> Maybe ObjectType readObjectType :: S.ByteString -> Maybe ObjectType
readObjectType "blob" = Just BlobObject readObjectType "blob" = Just BlobObject