git-annex/Git/Protocol/PktLine.hs
Joey Hess 1272ccdf1e
git pkt-line format
Git uses pkt-line in the pack and http protocols, and for the long-running
filter processes protocol as well.

This should be a quite efficient parser and builder since it uses
attoparsec and bytestring-builder.

This adds a dependency on attoparsec, but it's a free dependency because
eg aeson depends on attoparsec and git-annex depends on aeson.

This commit was supported by the NSF-funded DataLad project.
2018-08-10 14:12:33 -04:00

136 lines
4.1 KiB
Haskell

{- Git pkt-line format, as documented in
- git/Documentation/technical/protocol-common.txt
-
- Copyright 2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Git.Protocol.PktLine (
PktLine,
flushPkt,
textPktLine,
pktLineText,
streamPktLine,
encodePktLine,
parsePktLine,
decodePktLine,
splitPktLine,
) where
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.Attoparsec.ByteString as A
import Data.Attoparsec.ByteString.Char8 as A
import Data.Attoparsec.ByteString.Lazy as AL
import Data.ByteString.Builder
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import qualified Data.Text.Encoding.Error as E
import Data.Monoid
import Data.Word
-- | A pkt-line encodes a variable length binary string with a maximum size.
--
-- This module only exports smart constructors for legal pkt-lines.
newtype PktLine = PktLine S.ByteString
deriving (Show)
-- | Maximum data that can be contained in a pkt-line, not
-- including the 4 byte length header.
maxPktLineContent :: Word16
maxPktLineContent = 65516
-- | Maximum length of a pkt-line, including the 4 byte size header.
maxPktLineLength :: Word16
maxPktLineLength = 65520
-- | The flush-pkt, a special case in the protocol that is often used to
-- eg, signal the end of a stream of binary data.
flushPkt :: PktLine
flushPkt = PktLine S.empty
-- | Encodes a Text as a PktLine. Fails if the Text it too large.
--
-- A trailing newline is included after it, as the protocol recommends
-- doing for non-binary data.
textPktLine :: T.Text -> Maybe PktLine
textPktLine t =
let b = E.encodeUtf8 t <> "\n"
in if S.length b > fromIntegral maxPktLineContent
then Nothing
else Just (PktLine b)
-- | Extracts Text from a PktLine. Any trailing newline is removed.
pktLineText :: PktLine -> Either E.UnicodeException T.Text
pktLineText (PktLine b) = case E.decodeUtf8' b of
Left e -> Left e
Right t ->
let (t', end) = T.splitAt (T.length t - 1) t
in if end == "\n"
then Right t'
else Right t
-- | Creates a stream of PktLines encoding a lazy ByteString of any size.
-- Note that the stream is not terminated with a flushPkt.
streamPktLine :: L.ByteString -> [PktLine]
streamPktLine l =
let (chunk, rest) = L.splitAt (fromIntegral maxPktLineContent) l
pktline = PktLine $ mconcat $ L.toChunks chunk
in if L.null rest
then pktline : []
else pktline : streamPktLine rest
-- | ByteString builder for a pkt-line.
encodePktLine :: PktLine -> Builder
encodePktLine (PktLine b)
-- Avoid sending an empty pkt-line; send a flush-pkt instead.
| S.null b = "0000"
| otherwise = lengthheader <> byteString b
where
-- The length header is always 4 bytes long, and includes
-- itself in its length.
lengthheader = word16HexFixed (fromIntegral (S.length b) + 4)
-- | Attoparsec parser for a pkt-line.
parsePktLine :: Parser PktLine
parsePktLine = do
len <- parseLengthHeader
if len == 0
then do
endOfInput
return flushPkt
else if len < 4
-- It's impossible for a pkt-line to be less than
-- 4 bytes long, since the length header is 4 bytes.
then fail "invalid pkt-line length"
else PktLine <$> A.take (len - 4)
parseLengthHeader :: Parser Int
parseLengthHeader = do
-- Attoparsec's hexidecimal parser will consume any amount
-- of hex, but the length header is limited to 4 bytes, so
-- take those and parse only them.
h <- A.take 4
-- Require all 4 bytes to be hexidecimal by using endOfInput.
case parseOnly (hexadecimal <* endOfInput) h of
Left e -> fail e
Right len
| len > fromIntegral maxPktLineLength ->
fail "pkt-line too long"
| otherwise -> return len
-- | The ByteString must contain only a pkt-line with no additional data
-- or this will fail.
decodePktLine :: S.ByteString -> Either String PktLine
decodePktLine = parseOnly (parsePktLine <* endOfInput)
-- | Split the next PktLine from a lazy ByteString, returning it and the
-- remainder of the ByteString.
splitPktLine :: L.ByteString -> Either String (PktLine, L.ByteString)
splitPktLine = go . AL.parse parsePktLine
where
go (AL.Done rest p) = Right (p, rest)
go (AL.Fail _ _ e) = Left e