{-# LANGUAGE BangPatterns #-} {-# LANGUAGE MultiWayIf #-} {-# LANGUAGE OverloadedStrings #-} module Data.JsonStream.TokenParser ( Element(..) , TokenResult(..) , tokenParser ) where import Control.Applicative import Control.Monad (replicateM, when, (>=>)) import qualified Data.Aeson as AE import qualified Data.ByteString as BSW import qualified Data.ByteString.Char8 as BS import Data.Char (isDigit, isDigit, isLower, isSpace) import Data.Scientific (scientific) import qualified Data.Text as T import Data.Text.Encoding (decodeUtf8', encodeUtf8) data Element = ArrayBegin | ArrayEnd | ObjectBegin | ObjectEnd | StringBegin BS.ByteString | StringContent BS.ByteString | StringEnd | JValue AE.Value deriving (Show, Eq) -- Internal Interface for parsing monad data TokenResult' a = TokMoreData' (BS.ByteString -> TokenParser a) BS.ByteString | PartialResult' Element (TokenParser a) BS.ByteString -- ^ found element, continuation, actual parsing view - so that we can report the unparsed -- data when the parsing finishes. | TokFailed' BS.ByteString | Intermediate' a -- | Public interface for parsing JSON tokens. data TokenResult = TokMoreData (BS.ByteString -> TokenResult) BS.ByteString | PartialResult Element (TokenResult) BS.ByteString -- ^ found element, continuation, actual parsing view - so that we can report the unparsed -- data when the parsing finishes. | TokFailed BS.ByteString -- For debugging purposes instance Show TokenResult where show (TokMoreData _ ctx) = "(TokMoreData' + " ++ show ctx ++ ")" show (TokFailed _) = "TokFailed'" show (PartialResult el _ rest) = "(PartialResult' " ++ show el ++ " " ++ show rest ++ ")" data State = State { stData :: BS.ByteString , stContext :: BS.ByteString } newtype TokenParser a = TokenParser { runTokParser :: State -> (TokenResult' a, State) } instance Monad TokenParser where return x = TokenParser $ \s -> (Intermediate' x, s) {-# INLINE return #-} m >>= mpost = TokenParser $ \s -> let (res, newstate) = runTokParser m s in case res of TokMoreData' cont context -> (TokMoreData' (cont >=> mpost) context, newstate) PartialResult' el tokp context -> (PartialResult' el (tokp >>= mpost) context, newstate) TokFailed' context -> (TokFailed' context, newstate) Intermediate' result -> runTokParser (mpost result) newstate {-# INLINE (>>=) #-} instance Functor TokenResult' where fmap f (TokMoreData' newp ctx) = TokMoreData' (fmap f . newp) ctx fmap f (PartialResult' el tok ctx) = PartialResult' el (fmap f tok) ctx fmap _ (TokFailed' ctx) = TokFailed' ctx fmap f (Intermediate' a) = Intermediate' (f a) instance Applicative TokenParser where pure = return f <*> param = do mf <- f mparam <- param return (mf mparam) instance Functor TokenParser where fmap f tokp = TokenParser $ \s -> let (res, newstate) = runTokParser tokp s in (fmap f res, newstate) failTok :: TokenParser a failTok = TokenParser $ \s -> (TokFailed' (stContext s), s) {-# INLINE isBreakChar #-} isBreakChar :: Char -> Bool isBreakChar c = isSpace c || (c == '{') || (c == '[') || (c == '}') || (c == ']') || (c == ',') {-# INLINE peekChar #-} peekChar :: TokenParser Char peekChar = TokenParser handle where -- handle :: State -> (TokenResult' a, State) handle st@(State dta context) | BS.null dta = (TokMoreData' (\newdta -> TokenParser $ \_ -> handle (State newdta (BS.append context newdta))) context , st) | otherwise = (Intermediate' (BS.head dta), st) {-# INLINE pickChar #-} pickChar :: TokenParser Char pickChar = TokenParser handle where handle st@(State dta context) | BS.null dta = (TokMoreData' (\newdta -> TokenParser $ \_ -> handle (State newdta (BS.append context newdta))) context , st) | otherwise = (Intermediate' (BS.head dta), State (BS.tail dta) context) {-# INLINE yield #-} yield :: Element -> TokenParser () yield el = TokenParser $ \state@(State dta ctx) -> (PartialResult' el (contparse dta) ctx, state) where -- Use data as new context contparse dta = TokenParser $ const (Intermediate' (), State dta dta ) -- | Return SOME input satisfying predicate or none, if the next element does not satisfy -- Return tuple (str satisfying predicate, true_if_next_char_does_not_satisfy) {-# INLINE getWhile' #-} getWhile' :: (Char -> Bool) -> TokenParser (BS.ByteString, Bool) getWhile' predicate = do char <- peekChar if predicate char then getBuf else return ("", True) where getBuf = TokenParser $ \(State dta ctx) -> let (st,rest) = BS.span predicate dta in (Intermediate' (st, not (BS.null rest)), State rest ctx) -- | Read ALL input satisfying predicate {-# INLINE getWhile #-} getWhile :: (Char -> Bool) -> TokenParser BS.ByteString getWhile predicate = do (dta, complete) <- getWhile' predicate if complete then return dta else loop [dta] where loop acc = do (dta, complete) <- getWhile' predicate if complete then return $! BS.concat $ reverse (dta:acc) else loop (dta:acc) -- | Parse unquoted identifier - true/false/null parseIdent :: TokenParser () parseIdent = do ident <- getWhile isLower nextchar <- peekChar if | isBreakChar nextchar -> toTemp ident -- We found a barrier -> parse | otherwise -> failTok where toTemp "true" = yield $ JValue $ AE.Bool True toTemp "false" = yield $ JValue $ AE.Bool False toTemp "null" = yield $ JValue AE.Null toTemp _ = failTok parseUnicode :: TokenParser Char parseUnicode = do lst <- replicateM 4 pickChar return $! toEnum $ foldl1 (\a b -> 16 * a + b) $ map hexCharToInt lst where hexCharToInt :: Char -> Int hexCharToInt c | c >= 'A' && c <= 'F' = 10 + (fromEnum c - fromEnum 'A') | c >= 'a' && c <= 'f' = 10 + (fromEnum c - fromEnum 'a') | isDigit c = fromEnum c - fromEnum '0' | otherwise = error "Incorrect hex input, internal error." -- | Parse string, when finished check if we are object in dict (followed by :) or just a string parseString :: TokenParser () parseString = do -- leading '"' removed upstream (firstpart, _) <- getWhile' (\c -> c /= '"' && c /= '\\' ) chr <- peekChar if chr == '"' then pickChar >> handleDecode firstpart else do yield $ StringBegin firstpart handleString where handleDecode str = case decodeUtf8' str of Left _ -> failTok Right val -> yield $ JValue $ AE.String val handleString = do chr <- peekChar case chr of '"' -> do _ <- pickChar yield StringEnd '\\' -> do _ <- pickChar specchr <- pickChar nchr <- parseSpecChar specchr yield $ StringContent $ encodeUtf8 (T.singleton nchr) handleString _ -> do (dstr, _) <- getWhile' (\c -> c /= '"' && c /= '\\' ) yield $ StringContent dstr handleString parseSpecChar '"' = return '"' parseSpecChar '\\' = return '\\' parseSpecChar '/' = return '/' parseSpecChar 'b' = return '\b' parseSpecChar 'f' = return '\f' parseSpecChar 'n' = return '\n' parseSpecChar 'r' = return '\r' parseSpecChar 't' = return '\t' parseSpecChar 'u' = parseUnicode parseSpecChar c = return c parseNumber :: TokenParser () parseNumber = do tnumber <- getWhile (\c -> isDigit c || c == '.' || c == '+' || c == '-' || c == 'e' || c == 'E') let (csign, r1) = parseSign tnumber :: (Int, BS.ByteString) ((num, numdigits), r2) = parseDecimal r1 :: ((Integer, Int), BS.ByteString) ((frac, frdigits), r3) = parseFract r2 :: ((Int, Int), BS.ByteString) (texp, rest) = parseE r3 when (numdigits == 0 || not (BS.null rest)) failTok let dpart = fromIntegral csign * (num * (10 ^ frdigits) + fromIntegral frac) :: Integer e = texp - frdigits yield $ JValue $ AE.Number $ scientific dpart e where parseFract txt | BS.null txt = ((0, 0), txt) | BS.head txt == '.' = parseDecimal (BS.tail txt) | otherwise = ((0,0), txt) parseE txt | BS.null txt = (0, txt) | firstc == 'e' || firstc == 'E' = let (sign, rest) = parseSign (BS.tail txt) ((dnum, _), trest) = parseDecimal rest :: ((Int, Int), BS.ByteString) in (dnum * sign, trest) | otherwise = (0, txt) where firstc = BS.head txt parseSign txt | BS.null txt = (1, txt) | BS.head txt == '+' = (1, BS.tail txt) | BS.head txt == '-' = (-1, BS.tail txt) | otherwise = (1, txt) parseDecimal txt | BS.null txt = ((0, 0), txt) | otherwise = parseNum txt (0,0) -- parseNum :: BS.ByteString -> (Integer, Int) -> ((Integer, Int), BS.ByteString) parseNum txt (!start, !digits) | BS.null txt = ((start, digits), txt) | dchr >= 48 && dchr <= 57 = parseNum (BS.tail txt) (start * 10 + fromIntegral (dchr - 48), digits + 1) | otherwise = ((start, digits), txt) where dchr = BSW.head txt {-# INLINE peekCharInMain #-} -- Specialized version of peek char for main function so that we get faster performance peekCharInMain :: TokenParser Char peekCharInMain = TokenParser handle where handle st@(State dta ctx) | BS.null dta = (TokMoreData' (\newdta -> TokenParser $ \_ -> handle (State newdta (BS.append ctx newdta))) ctx , st) | chr == '[' = (PartialResult' ArrayBegin contparse ctx, st) | chr == ']' = (PartialResult' ArrayEnd contparse ctx, st) | chr == '{' = (PartialResult' ObjectBegin contparse ctx, st) | chr == '}' = (PartialResult' ObjectEnd contparse ctx, st) | isBlankChar chr = handle (State (BS.dropWhile isBlankChar dta) ctx) | chr == '"' = runTokParser (parseString >> peekCharInMain) (State rest ctx) | otherwise = (Intermediate' (BS.head dta), st) where chr = BS.head dta rest = BS.tail dta -- Use data as new context contparse = TokenParser $ const $ handle (State rest rest) isBlankChar c = c == ',' || c == ':' || isSpace c {-# INLINE mainParser #-} mainParser :: TokenParser () mainParser = do chr <- peekCharInMain case chr of 't' -> parseIdent 'f' -> parseIdent 'n' -> parseIdent '-' -> parseNumber _| isDigit chr -> parseNumber | otherwise -> failTok -- | Incremental lexer tokenParser :: BS.ByteString -> TokenResult tokenParser dta = handle $ runTokParser mainParser (State dta dta) where handle (TokMoreData' ntp ctx, st) = TokMoreData (\ndta -> handle $ runTokParser (ntp ndta) st) ctx handle (PartialResult' el ntp ctx, st) = PartialResult el (handle $ runTokParser ntp st) ctx handle (TokFailed' ctx, _) = TokFailed ctx handle (Intermediate' _, st) = handle $ runTokParser mainParser st