{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE ScopedTypeVariables       #-}

-- |
-- Module      : Streamly.Parser.Types
-- Copyright   : (c) 2020 Composewell Technologies
-- License     : BSD3
-- Maintainer  : streamly@composewell.com
-- Stability   : experimental
-- Portability : GHC
--
-- Streaming and backtracking parsers.
--
-- Parsers just extend folds.  Please read the 'Fold' design notes in
-- "Streamly.Internal.Data.Fold.Types" for background on the design.
--
-- = Parser Design
--
-- The 'Parser' type, or a parsing fold, is a generalization of the 'Fold'
-- type. The 'Fold' type /always/ succeeds on each input, therefore, it does
-- not need to buffer the input. In contrast, a 'Parser' may fail and backtrack
-- to replay the input again to explore another branch of the parser, so it
-- needs to buffer the input. A 'Parser' is, therefore, a fold with some
-- additional requirements.  To summarize, unlike a 'Fold', a 'Parser':
--
-- 1. may not generate a new value of the accumulator on every input; it may
-- generate a new value only after consuming multiple input elements
-- (e.g. takeEQ).
-- 2. on success may return some unconsumed input (e.g. takeWhile)
-- 3. may fail and return all input without consuming it (e.g. satisfy)
-- 4. may backtrack and start inspecting the past input again (e.g. alt)
--
-- These use cases require buffering and replaying of input.  To facilitate
-- this, the step function of the 'Fold' is augmented to return the next state
-- of the fold along with a command tag using a 'Step' functor; the tag tells
-- the fold driver to manipulate the future input as the parser wishes. The
-- 'Step' functor provides the following commands to the fold driver,
-- corresponding to the use cases outlined in the previous paragraph:
--
-- 1. 'Skip': hold (buffer) the input or go back to a previous position in the stream
-- 2. 'Yield', 'Stop': tell how much input is unconsumed
-- 3. 'Error': indicates that the parser has failed without a result
--
-- = How a Parser Works
--
-- A parser is just like a fold: it keeps consuming inputs from the stream and
-- accumulating them in an accumulator. The accumulator of the parser could be
-- a singleton value or it could be a collection of values, e.g. a list.
--
-- The parser may build a new output value from multiple input items. When it
-- consumes an input item but needs more input to build a complete output item
-- it uses @Skip 0 s@, yielding the intermediate state @s@ and asking the
-- driver to provide more input.  When the parser determines that a new output
-- value is complete it can use a @Stop n b@ to terminate the parser with @n@
-- items of input unused and the final value of the accumulator returned as
-- @b@. If at any time the parser determines that the parse has failed it can
-- return @Error err@.
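--
-- As a minimal sketch (using the 'Parser' and 'Step' types defined below;
-- this is not a combinator exported by this module), a parser that consumes
-- exactly two elements and returns their sum could be written as:
--
-- @
-- sumOfTwo :: MonadThrow m => Parser m Int Int
-- sumOfTwo = Parser step initial extract
--     where
--     initial = return Nothing
--     -- first element: consumed, but more input is needed
--     step Nothing a = return $ Skip 0 (Just a)
--     -- second element: consumed, the result is complete, nothing is unused
--     step (Just x) a = return $ Stop 0 (x + a)
--     -- the stream ended before a complete result could be built
--     extract _ = throwM $ ParseError "sumOfTwo: needs exactly two elements"
-- @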
--
-- A parser building a collection of values (e.g. a list) can use the @Yield@
-- constructor whenever a new item in the output collection is generated. If a
-- parser building a collection of values has yielded at least one value then
-- it is considered successful and cannot fail after that. In the current
-- implementation this is not automatically enforced; the rule is that the
-- parser MUST use only @Stop@ for termination after the first @Yield@, it
-- cannot use @Error@. It may be possible to change the implementation so that
-- this rule is not required, but there may be some performance cost to it.
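--
-- As another sketch (again using the types defined below, not an exported
-- combinator), a list-collecting variant of @takeWhile@ could use @Yield@
-- like this:
--
-- @
-- takeWhileList :: Monad m => (a -> Bool) -> Parser m a [a]
-- takeWhileList predicate = Parser step initial extract
--     where
--     initial = return []
--     step xs a =
--         return
--             $ if predicate a
--               -- a new point of no return, the driver may drop its buffer
--               then Yield 0 (a : xs)
--               -- the failing element is left unconsumed
--               else Stop 1 (reverse xs)
--     extract = return . reverse
-- @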
--
-- The 'Streamly.Internal.Data.Parser.takeWhile' and
-- 'Streamly.Internal.Data.Parser.some' combinators are good examples of
-- efficient implementations using all features of this representation.  It is
-- possible to idiomatically build a collection of parsed items using a
-- singleton parser and the @Alternative@ instance instead of using a
-- multi-yield parser.  However, a multi-yield parser is amenable to stream
-- fusion and can therefore be much faster.
--
-- = Error Handling
--
-- When a parser's @step@ function is invoked it may terminate with either a
-- 'Stop' or an 'Error' return value. In an 'Alternative' composition an error
-- return can make the composed parser backtrack and try another parser.
--
-- If the stream stops before a parser could terminate then we use the
-- @extract@ function of the parser to retrieve the last yielded value of the
-- parser. If the parser has yielded at least one value then @extract@ MUST
-- return a value without throwing an error; otherwise it throws an error
-- using the 'ParseError' exception.
--
-- We chose the exception throwing mechanism for @extract@, instead of an
-- explicit error return via an 'Either' type, to keep the interface simple,
-- as most of the time we do not need to catch the error in intermediate
-- layers. Note that we cannot use the exception throwing mechanism in the
-- @step@ function for performance reasons; the 'Error' constructor allows
-- loop fusion and better performance.
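--
-- For example, a user can recover from an ultimate parse failure by catching
-- 'ParseError'. A sketch, assuming the hypothetical @S.parse@ driver used in
-- the examples below runs in 'IO':
--
-- @
-- recover :: IO (Either ParseError Int)
-- recover = try $ S.parse (PR.satisfy (> 0)) (S.fromList [])
-- @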
--
-- = Future Work
--
-- It may make sense to move "takeWhile" type of parsers, which cannot fail but
-- need some lookahead, to splitting folds.  This will allow such combinators
-- to be accepted where we need an unfailing "Fold" type.
--
-- Based on application requirements it should be possible to design an even
-- richer interface to manipulate the input stream/buffer. For example, we
-- could seek into the stream randomly, in the forward or reverse direction,
-- from the current position, from the beginning or from the end.
--
-- We can distribute and scan/parse a stream using both folds and parsers and
-- merge the resulting streams using different merge strategies (e.g.
-- interleaving or serial).

module Streamly.Internal.Data.Parser.Types
    (
      Step (..)
    , Parser (..)
    , ParseError (..)

    , yield
    , yieldM
    , splitWith

    , die
    , dieM
    , splitSome
    , splitMany
    , alt
    )
where

import Control.Applicative (Alternative(..))
import Control.Exception (assert, Exception(..))
import Control.Monad (MonadPlus(..))
import Control.Monad.Catch (MonadCatch, try, throwM, MonadThrow)

import Fusion.Plugin.Types (Fuse(..))
import Streamly.Internal.Data.Fold (Fold(..), toList)
import Streamly.Internal.Data.Strict (Tuple3'(..))

-- | The return type of a 'Parser' step.
--
-- A parser is driven by a parse driver one step at a time; at any time the
-- driver may @extract@ the result of the parser. The parser may ask the driver
-- to backtrack at any point, therefore, the driver holds the input up to a
-- point of no return in a backtracking buffer.  The buffer grows or shrinks
-- based on the return values of the parser step execution.
--
-- When a parser step is executed it generates a new intermediate state of the
-- parse result along with a command to the driver. The command tells the
-- driver whether to keep the input stream for a potential backtracking later
-- on or drop it, and how much to keep. The constructors of 'Step' represent
-- the commands to the driver.
--
-- /Internal/
--
{-# ANN type Step Fuse #-}
data Step s b =
      Yield Int s
    -- ^ @Yield offset state@ indicates that the parser has yielded a new
    -- result which is a point of no return. The result can be extracted
    -- using @extract@. The driver drops the buffer except @offset@ elements
    -- before the current position in the stream. The rule is that if a parser
    -- has yielded at least once it cannot return a failure result.

    | Skip Int s
    -- ^ @Skip offset state@ indicates that the parser has consumed the current
    -- input but no new result has been generated. A new @state@ is generated.
    -- However, if we use @extract@ on @state@ it will generate a result from
    -- the previous @Yield@.  When @offset@ is non-zero it is a backward offset
    -- from the current position in the stream from which the driver will feed
    -- the next input to the parser. The offset cannot be beyond the latest
    -- point of no return created by @Yield@.

    | Stop Int b
    -- ^ @Stop offset result@ asks the driver to stop driving the parser
    -- because it has reached a fixed point and further input will not change
    -- the result.  @offset@ is the count of unused elements which includes the
    -- element on which 'Stop' occurred.

    | Error String
    -- ^ An error makes the parser backtrack to the last checkpoint and try
    -- another alternative.

instance Functor (Step s) where
    {-# INLINE fmap #-}
    fmap _ (Yield n s) = Yield n s
    fmap _ (Skip n s) = Skip n s
    fmap f (Stop n b) = Stop n (f b)
    fmap _ (Error err) = Error err

-- | A parser is a fold that can fail and is represented as @Parser step
-- initial extract@. Before we drive a parser we call the @initial@ action to
-- retrieve the initial state of the fold. The parser driver invokes @step@
-- with the state returned by the previous step and the next input element. It
-- results in a new state and a command to the driver represented by the 'Step'
-- type. The driver keeps invoking the step function until it stops or fails.
-- At any point of time the driver can call @extract@ to inspect the result of
-- the fold. It may result in an error or an output value.
--
-- /Internal/
--
data Parser m a b =
    forall s. Parser (s -> a -> m (Step s b)) (m s) (s -> m b)
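
-- For illustration, here is a minimal sketch (not part of this module) of a
-- @satisfy@-like parser written in terms of the 'Parser' and 'Step'
-- constructors. The real 'Streamly.Internal.Data.Parser.satisfy' used in the
-- examples in this module may differ in details:
--
-- @
-- satisfy :: MonadThrow m => (a -> Bool) -> Parser m a a
-- satisfy predicate = Parser step initial extract
--     where
--     initial = return ()
--     step () a =
--         return
--             $ if predicate a
--               then Stop 0 a  -- consume the current element and succeed
--               else Error "satisfy: predicate failed"
--     extract _ = throwM $ ParseError "satisfy: end of input"
-- @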

-- | This exception is used for two purposes:
--
-- * When a parser ultimately fails, the user of the parser is notified via
--   this exception.
-- * When the @extract@ function of a parser needs to throw an error.
--
-- /Internal/
--
newtype ParseError = ParseError String
    deriving Show

instance Exception ParseError where
    displayException (ParseError err) = err

instance Functor m => Functor (Parser m a) where
    {-# INLINE fmap #-}
    fmap f (Parser step1 initial extract) =
        Parser step initial (fmap2 f extract)

        where

        step s b = fmap2 f (step1 s b)
        fmap2 g = fmap (fmap g)

-- This is the dual of stream "yield".
--
-- | A parser that always yields a pure value without consuming any input.
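--
-- A usage sketch (with the same hypothetical @S.parse@ driver used in the
-- other examples in this module); this should evaluate to @"hello"@ without
-- consuming anything from the stream:
--
-- >>> S.parse (PR.yield "hello") $ S.fromList [1..]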
--
-- /Internal/
--
{-# INLINE yield #-}
yield :: Monad m => b -> Parser m a b
yield b = Parser (\_ _ -> pure $ Stop 1 b)  -- step
                 (pure ())                  -- initial
                 (\_ -> pure b)             -- extract

-- This is the dual of stream "yieldM".
--
-- | A parser that always yields the result of an effectful action without
-- consuming any input.
--
-- /Internal/
--
{-# INLINE yieldM #-}
yieldM :: Monad m => m b -> Parser m a b
yieldM b = Parser (\_ _ -> Stop 1 <$> b) -- step
                  (pure ())              -- initial
                  (\_ -> b)              -- extract

-------------------------------------------------------------------------------
-- Sequential applicative
-------------------------------------------------------------------------------

{-# ANN type SeqParseState Fuse #-}
data SeqParseState sl f sr = SeqParseL sl | SeqParseR f sr

-- Note: this implementation of splitWith is fast because of stream fusion but
-- has quadratic time complexity, because each composition adds a new branch
-- that each subsequent parse's input element has to go through, therefore, it
-- cannot scale to a large number of compositions. After around 100
-- compositions the performance starts dropping rapidly below that of a CPS
-- style unfused implementation.
--
-- | Sequential application. Apply two parsers sequentially to an input stream.
-- The input is provided to the first parser; when it is done the remaining
-- input is provided to the second parser. If both the parsers succeed, their
-- outputs are combined using the supplied function. The operation fails if
-- either of the parsers fails.
--
-- This undoes an "append" of two streams; it splits the stream using the two
-- parsers and zips the results.
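--
-- For example, the following sketch (hypothetical driver, as in the other
-- examples in this module) should produce @(1,2)@:
--
-- >>> S.parse (splitWith (,) (PR.satisfy (> 0)) (PR.satisfy (> 1))) $ S.fromList [1, 2]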
--
-- This implementation is strict in the second argument, therefore, the
-- following will fail:
--
-- >>> S.parse (PR.satisfy (> 0) *> undefined) $ S.fromList [1]
--
-- /Internal/
--
{-# INLINE splitWith #-}
splitWith :: Monad m
    => (a -> b -> c) -> Parser m x a -> Parser m x b -> Parser m x c
splitWith func (Parser stepL initialL extractL)
               (Parser stepR initialR extractR) =
    Parser step initial extract

    where

    initial = SeqParseL <$> initialL

    -- Note: For the composed parse to terminate, the left parser has to be
    -- a terminating parser returning a Stop at some point.
    step (SeqParseL st) a = do
        r <- stepL st a
        case r of
            -- Note: this leads to buffering even if we are not in an
            -- Alternative composition.
            Yield _ s -> return $ Skip 0 (SeqParseL s)
            Skip n s -> return $ Skip n (SeqParseL s)
            Stop n b -> Skip n <$> (SeqParseR (func b) <$> initialR)
            Error err -> return $ Error err

    step (SeqParseR f st) a = do
        r <- stepR st a
        return $ case r of
            Yield n s -> Yield n (SeqParseR f s)
            Skip n s -> Skip n (SeqParseR f s)
            Stop n b -> Stop n (f b)
            Error err -> Error err

    extract (SeqParseR f sR) = fmap f (extractR sR)
    extract (SeqParseL sL) = do
        rL <- extractL sL
        sR <- initialR
        rR <- extractR sR
        return $ func rL rR

-- | 'Applicative' form of 'splitWith'.
instance Monad m => Applicative (Parser m a) where
    {-# INLINE pure #-}
    pure = yield

    {-# INLINE (<*>) #-}
    (<*>) = splitWith id

-------------------------------------------------------------------------------
-- Sequential Alternative
-------------------------------------------------------------------------------

{-# ANN type AltParseState Fuse #-}
data AltParseState sl sr = AltParseL Int sl | AltParseR sr

-- Note: this implementation of alt is fast because of stream fusion but has
-- quadratic time complexity, because each composition adds a new branch that
-- each subsequent alternative's input element has to go through, therefore, it
-- cannot scale to a large number of compositions.
--
-- | Sequential alternative. Apply the input to the first parser and return the
-- result if the parser succeeds. If the first parser fails then backtrack and
-- apply the same input to the second parser and return the result.
--
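-- A sketch of successful backtracking (hypothetical driver, as in the other
-- examples in this module): the first parser fails on @1@, the input is
-- replayed and the second parser succeeds, producing @1@:
--
-- >>> S.parse (PR.satisfy (> 5) `PR.alt` PR.satisfy (> 0)) $ S.fromList [1..10]
--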
-- Note: This implementation is not lazy in the second argument. The following
-- will fail:
--
-- >>> S.parse (PR.satisfy (> 0) `PR.alt` undefined) $ S.fromList [1..10]
--
-- /Internal/
--
{-# INLINE alt #-}
alt :: Monad m => Parser m x a -> Parser m x a -> Parser m x a
alt (Parser stepL initialL extractL) (Parser stepR initialR extractR) =
    Parser step initial extract

    where

    initial = AltParseL 0 <$> initialL

    -- Once a parser yields at least one value it cannot fail.  This
    -- restriction helps us make backtracking more efficient, as we do not need
    -- to keep the consumed items buffered after a yield. Note that we do not
    -- enforce this and if a misbehaving parser does not honor this then we can
    -- get unexpected results.
    step (AltParseL cnt st) a = do
        r <- stepL st a
        case r of
            Yield n s -> return $ Yield n (AltParseL 0 s)
            Skip n s -> do
                assert (cnt + 1 - n >= 0) (return ())
                return $ Skip n (AltParseL (cnt + 1 - n) s)
            Stop n b -> return $ Stop n b
            Error _ -> do
                rR <- initialR
                return $ Skip (cnt + 1) (AltParseR rR)

    step (AltParseR st) a = do
        r <- stepR st a
        return $ case r of
            Yield n s -> Yield n (AltParseR s)
            Skip n s -> Skip n (AltParseR s)
            Stop n b -> Stop n b
            Error err -> Error err

    extract (AltParseR sR) = extractR sR
    extract (AltParseL _ sL) = extractL sL

-- | See documentation of 'Streamly.Internal.Data.Parser.many'.
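--
-- A usage sketch (hypothetical driver, as in the other examples in this
-- module); @toList@ is the fold imported above. This should collect the
-- leading matching elements into @[1,2,3]@, leaving the rest of the input
-- unconsumed:
--
-- >>> S.parse (splitMany toList (PR.satisfy (> 0))) $ S.fromList [1, 2, 3, 0, 4]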
--
-- /Internal/
--
{-# INLINE splitMany #-}
splitMany :: MonadCatch m => Fold m b c -> Parser m a b -> Parser m a c
splitMany (Fold fstep finitial fextract) (Parser step1 initial1 extract1) =
    Parser step initial extract

    where

    initial = do
        ps <- initial1 -- parse state
        fs <- finitial -- fold state
        pure (Tuple3' ps 0 fs)

    {-# INLINE step #-}
    step (Tuple3' st cnt fs) a = do
        r <- step1 st a
        let cnt1 = cnt + 1
        case r of
            Yield _ s -> return $ Skip 0 (Tuple3' s cnt1 fs)
            Skip n s -> do
                assert (cnt1 - n >= 0) (return ())
                return $ Skip n (Tuple3' s (cnt1 - n) fs)
            Stop n b -> do
                s <- initial1
                fs1 <- fstep fs b
                -- XXX we need to yield and backtrack here
                return $ Skip n (Tuple3' s 0 fs1)
            Error _ -> do
                xs <- fextract fs
                return $ Stop cnt1 xs

    -- XXX The "try" may impact performance if this parser is used as a scan
    extract (Tuple3' s _ fs) = do
        r <- try $ extract1 s
        case r of
            Left (_ :: ParseError) -> fextract fs
            Right b -> fstep fs b >>= fextract

-- | See documentation of 'Streamly.Internal.Data.Parser.some'.
--
-- /Internal/
--
{-# INLINE splitSome #-}
splitSome :: MonadCatch m => Fold m b c -> Parser m a b -> Parser m a c
splitSome (Fold fstep finitial fextract) (Parser step1 initial1 extract1) =
    Parser step initial extract

    where

    initial = do
        ps <- initial1 -- parse state
        fs <- finitial -- fold state
        pure (Tuple3' ps 0 (Left fs))

    {-# INLINE step #-}
    step (Tuple3' st cnt (Left fs)) a = do
        r <- step1 st a
        let cnt1 = cnt + 1
        case r of
            -- Note: the count is not used in this state, but 'Tuple3'' is
            -- strict in all its fields, so it must be kept defined.
            Yield _ s -> return $ Skip 0 (Tuple3' s cnt1 (Left fs))
            Skip  n s -> return $ Skip n (Tuple3' s (cnt1 - n) (Left fs))
            Stop n b -> do
                s <- initial1
                fs1 <- fstep fs b
                -- XXX this is also a yield point, we will never fail beyond
                -- this point. If we do not yield then if an error occurs after
                -- this then we will backtrack to the previous yield point
                -- instead of this point which is wrong.
                --
                -- so we need a yield with backtrack
                return $ Skip n (Tuple3' s 0 (Right fs1))
            Error err -> return $ Error err
    step (Tuple3' st cnt (Right fs)) a = do
        r <- step1 st a
        let cnt1 = cnt + 1
        case r of
            Yield _ s -> return $ Yield 0 (Tuple3' s cnt1 (Right fs))
            Skip n s -> do
                assert (cnt1 - n >= 0) (return ())
                return $ Skip n (Tuple3' s (cnt1 - n) (Right fs))
            Stop n b -> do
                s <- initial1
                fs1 <- fstep fs b
                -- XXX we need to yield here but also backtrack
                return $ Skip n (Tuple3' s 0 (Right fs1))
            Error _ -> Stop cnt1 <$> fextract fs

    -- XXX The "try" may impact performance if this parser is used as a scan
    extract (Tuple3' s _ (Left fs)) = extract1 s >>= fstep fs >>= fextract
    extract (Tuple3' s _ (Right fs)) = do
        r <- try $ extract1 s
        case r of
            Left (_ :: ParseError) -> fextract fs
            Right b -> fstep fs b >>= fextract

-- This is the dual of "nil".
--
-- | A parser that always fails with an error message without consuming
-- any input.
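--
-- Running it with the hypothetical @S.parse@ driver used in the other
-- examples aborts with a 'ParseError':
--
-- >>> S.parse (die "this always fails") $ S.fromList [1..]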
--
-- /Internal/
--
{-# INLINE die #-}
die :: MonadThrow m => String -> Parser m a b
die err =
    Parser (\_ _ -> pure $ Error err)      -- step
           (pure ())                       -- initial
           (\_ -> throwM $ ParseError err) -- extract

-- This is the dual of "nilM".
--
-- | A parser that always fails with an effectful error message, without
-- consuming any input.
--
-- /Internal/
--
{-# INLINE dieM #-}
dieM :: MonadThrow m => m String -> Parser m a b
dieM err =
    Parser (\_ _ -> Error <$> err)         -- step
           (pure ())                       -- initial
           (\_ -> err >>= throwM . ParseError) -- extract

-- Note: The default implementations of "some" and "many" loop infinitely
-- because of the strict pattern match on both the arguments in applicative and
-- alternative. With the direct style parser type we cannot use the mutually
-- recursive definitions of "some" and "many".
--
-- Note: With the direct style parser type, the list in "some" and "many" is
-- accumulated strictly, it cannot be consumed lazily.

-- | 'Alternative' instance using 'alt'.
--
-- Note: The implementation of '<|>' is not lazy in the second
-- argument. The following code will fail:
--
-- >>> S.parse (PR.satisfy (> 0) <|> undefined) $ S.fromList [1..10]
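--
-- A successful usage sketch (hypothetical driver, as above); @some@ collects
-- the leading matches and should produce @[1,2,3]@:
--
-- >>> S.parse (some (PR.satisfy (> 0))) $ S.fromList [1, 2, 3, 0, 4]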
--
instance MonadCatch m => Alternative (Parser m a) where
    {-# INLINE empty #-}
    empty = die "empty"

    {-# INLINE (<|>) #-}
    (<|>) = alt

    {-# INLINE many #-}
    many = splitMany toList

    {-# INLINE some #-}
    some = splitSome toList

{-# ANN type ConcatParseState Fuse #-}
data ConcatParseState sl p = ConcatParseL sl | ConcatParseR p

-- Note: The monad instance has quadratic performance complexity. It works fine
-- for small number of compositions but for a scalable implementation we need a
-- CPS version.

-- | Monad composition can be used for lookbehind parsers; we can make the
-- future parses depend on the previously parsed values.
--
-- If we have to parse "a9" or "9a" but not "99" or "aa" we can use the
-- following parser:
--
-- @
-- backtracking :: MonadCatch m => PR.Parser m Char String
-- backtracking =
--     sequence [PR.satisfy isDigit, PR.satisfy isAlpha]
--     '<|>'
--     sequence [PR.satisfy isAlpha, PR.satisfy isDigit]
-- @
--
-- We know that if the first parse resulted in a digit in the first position
-- then the second parse is going to fail.  However, we waste that information and
-- parse the first character again in the second parse only to know that it is
-- not an alphabetic char.  By using lookbehind in a 'Monad' composition we can
-- avoid redundant work:
--
-- @
-- data DigitOrAlpha = Digit Char | Alpha Char
--
-- lookbehind :: MonadCatch m => PR.Parser m Char String
-- lookbehind = do
--     x1 \<-    Digit '<$>' PR.satisfy isDigit
--          '<|>' Alpha '<$>' PR.satisfy isAlpha
--
--     -- Note: the parse depends on what we parsed already
--     x2 <- case x1 of
--         Digit _ -> PR.satisfy isAlpha
--         Alpha _ -> PR.satisfy isDigit
--
--     return $ case x1 of
--         Digit x -> [x,x2]
--         Alpha x -> [x,x2]
-- @
--
instance Monad m => Monad (Parser m a) where
    {-# INLINE return #-}
    return = pure

    -- (>>=) :: Parser m a b -> (b -> Parser m a c) -> Parser m a c
    {-# INLINE (>>=) #-}
    (Parser stepL initialL extractL) >>= func = Parser step initial extract

        where

        initial = ConcatParseL <$> initialL

        step (ConcatParseL st) a = do
            r <- stepL st a
            return $ case r of
                Yield _ s -> Skip 0 (ConcatParseL s)
                Skip n s -> Skip n (ConcatParseL s)
                Stop n b -> Skip n (ConcatParseR (func b))
                Error err -> Error err

        step (ConcatParseR (Parser stepR initialR extractR)) a = do
            st <- initialR
            r <- stepR st a
            return $ case r of
                Yield n s ->
                    Yield n (ConcatParseR (Parser stepR (return s) extractR))
                Skip n s ->
                    Skip n (ConcatParseR (Parser stepR (return s) extractR))
                Stop n b -> Stop n b
                Error err -> Error err

        extract (ConcatParseR (Parser _ initialR extractR)) =
            initialR >>= extractR

        extract (ConcatParseL sL) = extractL sL >>= f . func

            where

            f (Parser _ initialR extractR) = initialR >>= extractR

-- | 'mzero' is the same as 'empty', it aborts the parser. 'mplus' is the same
-- as '<|>', it selects the first succeeding parser.
--
-- /Internal/
--
instance MonadCatch m => MonadPlus (Parser m a) where
    {-# INLINE mzero #-}
    mzero = die "mzero"

    {-# INLINE mplus #-}
    mplus = alt