{-# LANGUAGE BangPatterns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_GHC -O2 #-} -- | Parser from lazy 'ByteString' to 'Tokens'. module Data.Aeson.Decoding.ByteString.Lazy ( lbsToTokens, ) where import Data.ByteString.Lazy (ByteString) import Data.Char (chr) import Data.Integer.Conversion (byteStringToInteger) import Data.Text (Text) import Data.Word (Word8) import qualified Data.Aeson.Key as Key import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as LBS import qualified Data.Scientific as Sci import Data.Aeson.Decoding.Internal import Data.Aeson.Decoding.Tokens import Data.Aeson.Internal.Text (unsafeDecodeASCII) import Data.Aeson.Internal.Unescape (unescapeText) import Data.Aeson.Internal.Word8 -- | Lex (and parse) lazy 'ByteString' into 'Tokens' stream. -- -- @since 2.1.2.0 -- lbsToTokens :: ByteString -> Tokens ByteString String lbsToTokens bs0 = goT bs0 id where goT :: Parser Tokens k goT (skipSpace -> bs) k = case LBS.uncons bs of Nothing -> tkErr "Unexpected end-of-input, expecting JSON value" Just (!w, !bs1) -> tokenCase w bs1 bs k tokenCase :: Word8 -- head -> ByteString -- tail -> ByteString -- whole input, needed for number parsing -> (ByteString -> k) -- continuation -> Tokens k String tokenCase W8_OPEN_CURLY !bs !_ k = TkRecordOpen (goR bs k) tokenCase W8_OPEN_SQUARE bs _ k = TkArrayOpen (goA bs k) tokenCase W8_DOUBLE_QUOTE bs _ k = scanStringLiteral (\t bs' -> TkText t (k bs')) tkErr bs tokenCase W8_MINUS bs _ k = scanNumberLiteral (\n bs' -> TkNumber (negateNumber n) (k bs')) tkErr bs tokenCase w _ wbs k | W8_0 <= w, w <= W8_9 = scanNumberLiteral (\n bs' -> TkNumber n (k bs')) tkErr wbs tokenCase W8_n bs _ k | Just bs1 <- stripPrefix "ull" 3 bs = TkLit LitNull (k bs1) tokenCase W8_t bs _ k | Just bs1 <- stripPrefix "rue" 3 bs = TkLit LitTrue (k bs1) tokenCase W8_f bs _ k | Just bs1 <- stripPrefix "alse" 4 bs = TkLit LitFalse (k bs1) tokenCase _ _ wbs _ = tkErr $ "Unexpected " ++ showBeginning wbs ++ ", expecting JSON value" -- Array goA :: Parser TkArray k goA (skipSpace -> bs) k = case LBS.uncons bs of Nothing -> tkErrEOF "JSON value or ]" Just (W8_CLOSE_SQUARE, !bs1) -> TkArrayEnd (k bs1) Just (w, !bs1) -> TkItem $ tokenCase w bs1 bs $ \bs2 -> goA1 bs2 k goA1 :: Parser TkArray k goA1 (skipSpace -> bs) k = case LBS.uncons bs of Nothing -> tkErrEOF ", or ]" Just (W8_CLOSE_SQUARE, !bs1) -> TkArrayEnd (k bs1) Just (W8_COMMA, !bs1) -> TkItem $ goT bs1 $ \bs2 -> goA1 bs2 k _ -> tkErrBS bs ", or ]" -- Record goR :: Parser TkRecord k goR (skipSpace -> bs) k = case LBS.uncons bs of Nothing -> tkErrEOF "record key literal or }" Just (W8_DOUBLE_QUOTE, !bs1) -> goRK bs1 k -- " Just (W8_CLOSE_CURLY, !bs1) -> TkRecordEnd (k bs1) -- } Just _ -> tkErrBS bs "record key literal or }" -- after record pair, expecting ," or } goR1 :: Parser TkRecord k goR1 (skipSpace -> bs) k = case LBS.uncons bs of Nothing -> tkErr "Unexpected end-of-input, expecting , or }" Just (W8_COMMA, !bs1) -> case LBS.uncons (skipSpace bs1) of Nothing -> tkErrEOF "key literal" Just (W8_DOUBLE_QUOTE, !bs2) -> goRK bs2 k Just _ -> tkErrBS bs "key literal" Just (W8_CLOSE_CURLY, !bs1) -> TkRecordEnd (k bs1) _ -> tkErr $ "Unexpected " ++ showBeginning bs ++ ", expecting , or }" -- key of record (after double quote) goRK :: Parser TkRecord k goRK bs1 k = scanStringLiteral (\t bs -> goRK' t bs k) tkErr bs1 -- after key of a record, expecting : goRK' :: Text -> Parser TkRecord k goRK' t (skipSpace -> bs) k = case LBS.uncons bs of Nothing -> tkErrEOF ":" Just (W8_COLON, !bs3) -> TkPair (Key.fromText t) $ goT bs3 $ \bs4 -> goR1 bs4 k Just _ -> tkErrBS bs ":" stripPrefix :: ByteString -> Int -> ByteString -> Maybe ByteString stripPrefix pfx n bs | LBS.isPrefixOf pfx bs = Just (LBS.drop (fromIntegral n) bs) | otherwise = Nothing {-# INLINE stripPrefix #-} type Parser tk k = ByteString -> (ByteString -> k) -> tk k String showBeginning :: ByteString -> String showBeginning = show . LBS.take 30 -- | Strip leading (ASCII) space skipSpace :: ByteString -> ByteString skipSpace = LBS.dropWhile $ \w -> w == 0x20 || w == 0x0a || w == 0x0d || w == 0x09 {-# INLINE skipSpace #-} tkErrEOF :: AsError t =>String -> t k String tkErrEOF expected = tkErr $ "Unexpected end-of-input, expecting " ++ expected {-# INLINE tkErrEOF #-} tkErrBS :: AsError t => ByteString -> String -> t k String tkErrBS bs expected = tkErr $ "Unexpected " ++ showBeginning bs ++ ", expecting " ++ expected {-# INLINE tkErrBS #-} lbsTake :: Int -> ByteString -> BS.ByteString lbsTake n bs = LBS.toStrict (LBS.take (fromIntegral n) bs) lbsDrop :: Int -> ByteString -> ByteString lbsDrop n = LBS.drop (fromIntegral n) ------------------------------------------------------------------------------- -- Text ------------------------------------------------------------------------------- scanStringLiteral :: forall r. (Text -> ByteString -> r) -> (String -> r) -> ByteString -> r scanStringLiteral ok err bs0 = go 0 bs0 where go :: Int -> ByteString -> r go !n !bs = case LBS.uncons bs of Nothing -> errEnd Just (34, _) -> ok (unsafeDecodeASCII (lbsTake n bs0)) (lbsDrop (n + 1) bs0) Just (92, bs') -> goSlash (n + 1) bs' Just (w8, bs') | w8 < 0x20 -> errCC | w8 >= 0x80 -> goEsc (n + 1) bs' | otherwise -> go (n + 1) bs' -- in goEsc and goSlash we don't need to check for control characters as unescapeText verifies that. goEsc :: Int -> ByteString -> r goEsc !n !bs = case LBS.uncons bs of Nothing -> errEnd Just (34, _) -> case unescapeText (lbsTake n bs0) of Right t -> ok t (lbsDrop (n + 1) bs0) Left e -> err (show e) Just (92, bs') -> goSlash (n + 1) bs' Just (_, bs') -> goEsc (n + 1) bs' goSlash :: Int -> ByteString -> r goSlash !n !bs = case LBS.uncons bs of Nothing -> errEnd Just (_, bs') -> goEsc (n + 1) bs' errEnd = err "Unexpected end-of-input while parsing string literal" errCC = err "Unespected control character while parsing string literal" ------------------------------------------------------------------------------- -- Number ------------------------------------------------------------------------------- -- -- number := integer fraction exponent -- integer := 0 | [1-9][0-9]* | -0 | -[1-9][0-9]* -- fraction := "" | . [0-9]+ -- exponent := "" | E sign [0-9]+ | e sign [0-9]+ -- sign := "" | - | + -- -- This scanner doesn't recognize the leading minus sign, we recognize only integer := 0 | [1-9][0-9]*, -- as the minus sign is recognized by outer scanner already. -- scanNumberLiteral :: forall r. (Number -> ByteString -> r) -> (String -> r) -> ByteString -> r scanNumberLiteral kont err bs0 = state_start bs0 where state_start :: ByteString -> r state_start !bs = case LBS.uncons bs of Nothing -> errEnd Just (w8, bs') | W8_0 < w8, w8 <= W8_9 -> state_i1 1 bs' | W8_0 == w8 -> state_after0 bs' | otherwise -> err $ "Unexpected " ++ show w8 ++ " while parsing number literal" state_after0 :: ByteString -> r state_after0 !bs = case LBS.uncons bs of Nothing -> kont (NumInteger 0) bs Just (w8, bs') | W8_0 <= w8, w8 <= W8_9 -> err "Number literal with leading zero" | W8_DOT == w8 -> go_dec 0 bs' | W8_e == w8 || W8_E == w8 -> go_sci 0 0 bs' | otherwise -> kont (NumInteger 0) bs state_i1 :: Int -> ByteString -> r state_i1 !n !bs = case LBS.uncons bs of Nothing -> kont (NumInteger int) bs Just (w8, bs') | W8_0 <= w8, w8 <= W8_9 -> state_i1 (n + 1) bs' | W8_DOT == w8 -> go_dec int bs' | W8_e == w8 || W8_E == w8 -> go_sci int 0 bs' | otherwise -> kont (NumInteger int) bs where int = byteStringToInteger (lbsTake n bs0) go_dec :: Integer -> ByteString -> r go_dec !int !bs1 = case LBS.uncons bs1 of Nothing -> errEnd Just (w8, bs') | W8_0 <= w8, w8 <= W8_9 -> state_dec 1 bs' | otherwise -> err $ "Unexpected " ++ show w8 ++ " while parsing number literal" where state_dec :: Int -> ByteString -> r state_dec !n !bs = case LBS.uncons bs of Nothing -> kont (NumDecimal dec) bs Just (w8, bs') | W8_0 <= w8, w8 <= W8_9 -> state_dec (n + 1) bs' | W8_e == w8 || W8_E == w8 -> go_sci coef (negate n) bs' | otherwise -> kont (NumDecimal dec) bs where frac = byteStringToInteger (lbsTake n bs1) coef = int * 10 ^ n + frac dec = Sci.scientific coef (negate n) go_sci :: Integer -> Int -> ByteString -> r go_sci !coef !exp10 !bs2 = case LBS.uncons bs2 of Nothing -> errEnd Just (w8, bs') | W8_0 <= w8, w8 <= W8_9 -> go_sci_pos coef exp10 bs2 1 bs' | W8_PLUS == w8 -> case LBS.uncons bs' of Nothing -> errEnd Just (w8', bs'') | W8_0 <= w8', w8' <= W8_9 -> go_sci_pos coef exp10 bs' 1 bs'' | otherwise -> errUnx w8' | W8_MINUS == w8 -> case LBS.uncons bs' of Nothing -> errEnd Just (w8', bs'') | W8_0 <= w8', w8' <= W8_9 -> go_sci_neg coef exp10 bs' 1 bs'' | otherwise -> errUnx w8' | otherwise -> errUnx w8 go_sci_pos :: Integer -> Int -> ByteString -> Int -> ByteString -> r go_sci_pos !coef !exp10 !bs2 !n !bs = case LBS.uncons bs of Nothing -> kont (NumScientific sci) bs Just (w8, bs') | W8_0 <= w8, w8 <= W8_9 -> go_sci_pos coef exp10 bs2 (n + 1) bs' | otherwise -> kont (NumScientific sci) bs where exp10' = fromInteger (byteStringToInteger (lbsTake n bs2)) sci = Sci.scientific coef (exp10 + exp10') go_sci_neg :: Integer -> Int -> ByteString -> Int -> ByteString -> r go_sci_neg !coef !exp10 !bs2 !n !bs = case LBS.uncons bs of Nothing -> kont (NumScientific sci) bs Just (w8, bs') | W8_0 <= w8, w8 <= W8_9 -> go_sci_neg coef exp10 bs2 (n + 1) bs' | otherwise -> kont (NumScientific sci) bs where exp10' = fromInteger (byteStringToInteger (lbsTake n bs2)) sci = Sci.scientific coef (exp10 - exp10') errEnd = err "Unexpected end-of-input while parsing number literal" errUnx w8 = err $ "Unexpected " ++ show (chr (fromIntegral w8)) ++ " while parsing number literal"