git pkt-line format

Git uses pkt-line in the pack and http protocols, and for the long-running
filter processes protocol as well.

This should be a quite efficient parser and builder since it uses
attoparsec and bytestring-builder.

This adds a dependency on attoparsec, but it's a free dependency because
eg aeson depends on attoparsec and git-annex depends on aeson.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
Joey Hess 2018-08-10 13:38:29 -04:00
parent 816e0e356b
commit 1272ccdf1e
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 139 additions and 0 deletions

136
Git/Protocol/PktLine.hs Normal file
View file

@ -0,0 +1,136 @@
{- Git pkt-line format, as documented in
- git/Documentation/technical/protocol-common.txt
-
- Copyright 2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Git.Protocol.PktLine (
PktLine,
flushPkt,
textPktLine,
pktLineText,
streamPktLine,
encodePktLine,
parsePktLine,
decodePktLine,
splitPktLine,
) where
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.Attoparsec.ByteString as A
import Data.Attoparsec.ByteString.Char8 as A
import Data.Attoparsec.ByteString.Lazy as AL
import Data.ByteString.Builder
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import qualified Data.Text.Encoding.Error as E
import Data.Monoid
import Data.Word
-- | A pkt-line encodes a variable length binary string with a maximum size.
--
-- This module only exports smart constructors for legal pkt-lines.
newtype PktLine = PktLine S.ByteString
deriving (Show)
-- | Maximum data that can be contained in a pkt-line, not
-- including the 4 byte length header.
maxPktLineContent :: Word16
maxPktLineContent = 65516
-- | Maximum length of a pkt-line, including the 4 byte size header.
maxPktLineLength :: Word16
maxPktLineLength = 65520
-- | The flush-pkt, a special case in the protocol that is often used to
-- eg, signal the end of a stream of binary data.
flushPkt :: PktLine
flushPkt = PktLine S.empty
-- | Encodes a Text as a PktLine. Fails if the Text it too large.
--
-- A trailing newline is included after it, as the protocol recommends
-- doing for non-binary data.
textPktLine :: T.Text -> Maybe PktLine
textPktLine t =
let b = E.encodeUtf8 t <> "\n"
in if S.length b > fromIntegral maxPktLineContent
then Nothing
else Just (PktLine b)
-- | Extracts Text from a PktLine. Any trailing newline is removed.
pktLineText :: PktLine -> Either E.UnicodeException T.Text
pktLineText (PktLine b) = case E.decodeUtf8' b of
Left e -> Left e
Right t ->
let (t', end) = T.splitAt (T.length t - 1) t
in if end == "\n"
then Right t'
else Right t
-- | Creates a stream of PktLines encoding a lazy ByteString of any size.
-- Note that the stream is not terminated with a flushPkt.
streamPktLine :: L.ByteString -> [PktLine]
streamPktLine l =
let (chunk, rest) = L.splitAt (fromIntegral maxPktLineContent) l
pktline = PktLine $ mconcat $ L.toChunks chunk
in if L.null rest
then pktline : []
else pktline : streamPktLine rest
-- | ByteString builder for a pkt-line.
encodePktLine :: PktLine -> Builder
encodePktLine (PktLine b)
-- Avoid sending an empty pkt-line; send a flush-pkt instead.
| S.null b = "0000"
| otherwise = lengthheader <> byteString b
where
-- The length header is always 4 bytes long, and includes
-- itself in its length.
lengthheader = word16HexFixed (fromIntegral (S.length b) + 4)
-- | Attoparsec parser for a pkt-line.
parsePktLine :: Parser PktLine
parsePktLine = do
len <- parseLengthHeader
if len == 0
then do
endOfInput
return flushPkt
else if len < 4
-- It's impossible for a pkt-line to be less than
-- 4 bytes long, since the length header is 4 bytes.
then fail "invalid pkt-line length"
else PktLine <$> A.take (len - 4)
parseLengthHeader :: Parser Int
parseLengthHeader = do
-- Attoparsec's hexidecimal parser will consume any amount
-- of hex, but the length header is limited to 4 bytes, so
-- take those and parse only them.
h <- A.take 4
-- Require all 4 bytes to be hexidecimal by using endOfInput.
case parseOnly (hexadecimal <* endOfInput) h of
Left e -> fail e
Right len
| len > fromIntegral maxPktLineLength ->
fail "pkt-line too long"
| otherwise -> return len
-- | The ByteString must contain only a pkt-line with no additional data
-- or this will fail.
decodePktLine :: S.ByteString -> Either String PktLine
decodePktLine = parseOnly (parsePktLine <* endOfInput)
-- | Split the next PktLine from a lazy ByteString, returning it and the
-- remainder of the ByteString.
splitPktLine :: L.ByteString -> Either String (PktLine, L.ByteString)
splitPktLine = go . AL.parse parsePktLine
where
go (AL.Done rest p) = Right (p, rest)
go (AL.Fail _ _ e) = Left e

1
debian/control vendored
View file

@ -12,6 +12,7 @@ Build-Depends:
libghc-pcre-light-dev, libghc-pcre-light-dev,
libghc-cryptonite-dev, libghc-cryptonite-dev,
libghc-memory-dev, libghc-memory-dev,
libghc-attoparsec-dev,
libghc-sandi-dev, libghc-sandi-dev,
libghc-utf8-string-dev, libghc-utf8-string-dev,
libghc-aws-dev (>= 0.9.2-2~), libghc-aws-dev (>= 0.9.2-2~),

View file

@ -364,6 +364,7 @@ Executable git-annex
cryptonite, cryptonite,
memory, memory,
split, split,
attoparsec,
QuickCheck (>= 2.1), QuickCheck (>= 2.1),
tasty (>= 0.7), tasty (>= 0.7),
tasty-hunit, tasty-hunit,
@ -846,6 +847,7 @@ Executable git-annex
Git.LsTree Git.LsTree
Git.Merge Git.Merge
Git.Objects Git.Objects
Git.Protocol.PktLine
Git.Queue Git.Queue
Git.Ref Git.Ref
Git.RefLog Git.RefLog