attoparsec parser for presence logs

This commit is contained in:
Joey Hess 2019-01-03 15:27:29 -04:00
parent bfc9039ead
commit ef8ddaa713
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 54 additions and 10 deletions

View file

@ -17,6 +17,7 @@ import Prelude
import Utility.Env
import Utility.TimeStamp
import Utility.QuickCheck
import qualified Data.Attoparsec.ByteString.Lazy as A
-- | Some very old logs did not have any time stamp at all;
-- Unknown is used for those.
@ -44,3 +45,6 @@ formatVectorClock (VectorClock t) = show t
parseVectorClock :: String -> Maybe VectorClock
parseVectorClock t = VectorClock <$> parsePOSIXTime t
vectorClockParser :: A.Parser VectorClock
vectorClockParser = VectorClock <$> parserPOSIXTime

View file

@ -36,6 +36,12 @@ DAMAGE.
module Logs.Line where
import Common
import qualified Data.Attoparsec.ByteString.Lazy as A
import Data.Attoparsec.ByteString.Char8 (isEndOfLine)
import qualified Data.DList as D
-- This is the same as Data.List.lines, with \r added.
-- This works around some versions of git-annex which wrote \r
-- into git-annex branch files on Windows. Those \r's sometimes
@ -49,3 +55,27 @@ splitLines s = cons (case break (\c -> c == '\n' || c == '\r') s of
_:s'' -> splitLines s''))
where
cons ~(h, t) = h : t
{- Applies a parser to each line of a log file.
-
- If the parser fails to parse a line, that line is skipped, instead of
- the overall parse failing. This is generally a good idea in case a newer
- version of git-annex somehow changed the format of the log file.
-
- Any combination of \r and \n are taken to be the end of the line.
- (Some versions of git-annex on Windows wrote \r into git-annex branch
- files, and multiple \r's sometimes accumulated.)
-
- The parser does not itself need to avoid parsing beyond the end of line;
- this is implemented only pass the content of a line to the parser.
-}
parseLogLines :: A.Parser a -> A.Parser [a]
parseLogLines parser = go D.empty
where
go dl = do
line <- A.takeTill isEndOfLine
A.skipWhile isEndOfLine
let dl' = case A.parseOnly parser line of
Left _ -> dl
Right v -> D.snoc dl v
(A.endOfInput *> return (D.toList dl')) <|> go dl'

View file

@ -15,6 +15,8 @@ import Utility.QuickCheck
import qualified Data.Map as M
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString as S
import qualified Data.Attoparsec.ByteString.Lazy as A
import Data.Attoparsec.ByteString.Char8 (char, anyChar)
import Data.ByteString.Builder
newtype LogInfo = LogInfo { fromLogInfo :: S.ByteString }
@ -32,17 +34,25 @@ instance Show LogLine where
data LogStatus = InfoPresent | InfoMissing | InfoDead
deriving (Eq, Show, Bounded, Enum)
{- Parses a log file. Unparseable lines are ignored. -}
parseLog :: L.ByteString -> [LogLine]
parseLog = mapMaybe parseline . splitLines . decodeBL
where
parseline l = LogLine
<$> parseVectorClock c
<*> parseStatus s
<*> pure (LogInfo (encodeBS rest))
where
(c, pastc) = separate (== ' ') l
(s, rest) = separate (== ' ') pastc
parseLog = fromMaybe [] . A.maybeResult . A.parse (logParser <* A.endOfInput)
logParser :: A.Parser [LogLine]
logParser = parseLogLines $ LogLine
<$> vectorClockParser
<* char ' '
<*> statusParser
<* char ' '
<*> (LogInfo <$> A.takeByteString)
statusParser :: A.Parser LogStatus
statusParser = do
c <- anyChar
case c of
'1' -> return InfoPresent
'0' -> return InfoMissing
'X' -> return InfoDead
_ -> fail "unknown status character"
parseStatus :: String -> Maybe LogStatus
parseStatus "1" = Just InfoPresent