optimise catfile interface with ByteString and Attoparsec

Around 3% total speedup.

Profiling git annex find --not --in web, it's now bytestring end-to-end,
and there is only a little added overhead in eg accessing the Annex
state MVar (3%). The rest of the runtime is spent reading symlinks, and
in attoparsec.

This feels like the end of the optimisation road, without a major change
like caching information for faster queries.
This commit is contained in:
Joey Hess 2020-04-10 14:03:40 -04:00
parent 2caf579718
commit 86426036a0
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 31 additions and 23 deletions

View file

@ -3,7 +3,7 @@ git-annex (8.20200331) UNRELEASED; urgency=medium
* Improve git-annex's ability to find the path to its program,
especially when it needs to run itself in another repo to upgrade it.
* adb: Better messages when the adb command is not installed.
* Sped up query commands that read the git-annex branch by around 6%.
* Sped up query commands that read the git-annex branch by around 9%.
* Various speed improvements gained by using ByteStrings for git refs and
shas.

View file

@ -1,6 +1,6 @@
{- git cat-file interface
-
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -24,6 +24,9 @@ module Git.CatFile (
import System.IO
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Char8 as S8
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.ByteString.Char8 as A8
import qualified Data.Map as M
import Data.String
import Data.Char
@ -83,9 +86,9 @@ catObject h object = maybe L.empty fst3 <$> catObjectDetails h object
catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, ObjectType))
catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \from -> do
header <- hGetLine from
header <- S8.hGetLine from
case parseResp object header of
Just (ParsedResp sha size objtype) -> do
Just (ParsedResp sha objtype size) -> do
content <- S.hGet from (fromIntegral size)
eatchar '\n' from
return $ Just (L.fromChunks [content], sha, objtype)
@ -113,9 +116,9 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
{- Gets the size and type of an object, without reading its content. -}
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
resp <- hGetLine from
resp <- S8.hGetLine from
case parseResp object resp of
Just (ParsedResp sha size objtype) ->
Just (ParsedResp sha objtype size) ->
return $ Just (sha, size, objtype)
Just DNE -> return Nothing
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
@ -127,35 +130,39 @@ catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $
objtype <- queryObjectType object (gitRepo h)
return $ (,,) <$> sha <*> sz <*> objtype
data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE
data ParsedResp = ParsedResp Sha ObjectType FileSize | DNE
deriving (Show)
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
query hdl object newlinefallback receive
-- git cat-file --batch uses a line based protocol, so when the
-- filename itself contains a newline, have to fall back to another
-- method of getting the information.
| '\n' `elem` s = newlinefallback
| '\n' `S8.elem` s = newlinefallback
-- git strips carriage return from the end of a line, out of some
-- misplaced desire to support windows, so also use the newline
-- fallback for those.
| "\r" `isSuffixOf` s = newlinefallback
| "\r" `S8.isSuffixOf` s = newlinefallback
| otherwise = CoProcess.query hdl send receive
where
send to = hPutStrLn to s
s = fromRef object
send to = S8.hPutStrLn to s
s = fromRef' object
parseResp :: Ref -> String -> Maybe ParsedResp
parseResp object l
| " missing" `isSuffixOf` l -- less expensive than full check
&& l == fromRef object ++ " missing" = Just DNE
| otherwise = case words l of
[sha, objtype, size] -> case extractSha (encodeBS sha) of
Just sha' -> case (readObjectType (encodeBS objtype), reads size) of
(Just t, [(bytes, "")]) ->
Just $ ParsedResp sha' bytes t
_ -> Nothing
Nothing -> Nothing
_ -> Nothing
parseResp :: Ref -> S.ByteString -> Maybe ParsedResp
parseResp object s
| " missing" `S.isSuffixOf` s -- less expensive than full check
&& s == fromRef' object <> " missing" = Just DNE
| otherwise = eitherToMaybe $ A.parseOnly respParser s
respParser :: A.Parser ParsedResp
respParser = ParsedResp
<$> (maybe (fail "bad sha") return . extractSha =<< nextword)
<* A8.char ' '
<*> (maybe (fail "bad object type") return . readObjectType =<< nextword)
<* A8.char ' '
<*> A8.decimal
where
nextword = A8.takeTill (== ' ')
querySingle :: CommandParam -> Ref -> Repo -> (Handle -> IO a) -> IO (Maybe a)
querySingle o r repo reader = assertLocal repo $

View file

@ -101,6 +101,7 @@ newtype RefDate = RefDate String
{- Types of objects that can be stored in git. -}
data ObjectType = BlobObject | CommitObject | TreeObject
deriving (Show)
readObjectType :: S.ByteString -> Maybe ObjectType
readObjectType "blob" = Just BlobObject