{-|
Description:    Extract basic semantic categories from a simple textual stream.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      provisional
Portability:    portable

This module and the internal branch it heads implement the "Tokenization"
section of the
__[HTML](https://html.spec.whatwg.org/multipage/parsing.html#tokenization)__
document parsing specification, processing a stream of text to add information
on, or group it by, semantic category.  This then allows the following stage to
base its logic on such higher-level concepts as "markup tag" or "comment"
without worrying about the (sometimes complex) escaping behaviour required to
parse them.
-}
module Web.Mangrove.Parse.Tokenize
    ( -- * Types
      -- ** Final
      Token ( .. )
    , BasicAttribute
    , TagParams ( .. )
    , emptyTagParams
    , DoctypeParams ( .. )
    , emptyDoctypeParams
      -- ** Intermediate
    , TokenizerState
    , CurrentTokenizerState ( .. )
    , Encoding ( .. )
      -- * Initialization
    , defaultTokenizerState
    , tokenizerMode
    , tokenizerStartTag
    , tokenizerEncoding
      -- * Transformations
    , tokenize
    , tokenizeStep
    , finalizeTokenizer
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad.Trans.State as N.S

import qualified Data.Bifunctor as F.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Maybe as Y

import Web.Willow.DOM

import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Encoding.Preprocess
import Web.Mangrove.Parse.Tokenize.Common hiding ( setRemainder )
import Web.Mangrove.Parse.Tokenize.Dispatcher
import Web.Willow.Common.Encoding
import Web.Willow.Common.Encoding.Sniffer
import Web.Willow.Common.Parser


-- | __HTML:__
--      @[tokenization]
--      (https://html.spec.whatwg.org/multipage/parsing.html#tokenization)@
-- 
-- Given a starting environment, transform a binary document stream into a
-- stream of semantic atoms.  If the parse fails, returns all tokens before the
-- one which caused the error, but any trailing bytes are silently dropped.
tokenize
    :: TokenizerState
    -> BS.ByteString
    -> ([([ParseError], Token)], TokenizerState)
tokenize :: TokenizerState
-> ByteString -> ([([ParseError], Token)], TokenizerState)
tokenize TokenizerState
state ByteString
stream = ([([ParseError], Token)], TokenizerState, ByteString)
-> ([([ParseError], Token)], TokenizerState)
loop (([([ParseError], Token)], TokenizerState, ByteString)
 -> ([([ParseError], Token)], TokenizerState))
-> ([([ParseError], Token)], TokenizerState, ByteString)
-> ([([ParseError], Token)], TokenizerState)
forall a b. (a -> b) -> a -> b
$ TokenizerState
-> ByteString
-> ([([ParseError], Token)], TokenizerState, ByteString)
tokenizeStep TokenizerState
state ByteString
stream
  where loop :: ([([ParseError], Token)], TokenizerState, ByteString)
-> ([([ParseError], Token)], TokenizerState)
loop ([([ParseError], Token)]
ts, TokenizerState
state', ByteString
output)
            | ByteString -> Bool
BS.null ByteString
output = ([([ParseError], Token)]
ts, TokenizerState
state')
            | Bool
otherwise = ([([ParseError], Token)] -> [([ParseError], Token)])
-> ([([ParseError], Token)], TokenizerState)
-> ([([ParseError], Token)], TokenizerState)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
F.B.first ([([ParseError], Token)]
ts [([ParseError], Token)]
-> [([ParseError], Token)] -> [([ParseError], Token)]
forall a. [a] -> [a] -> [a]
++) (([([ParseError], Token)], TokenizerState)
 -> ([([ParseError], Token)], TokenizerState))
-> (([([ParseError], Token)], TokenizerState, ByteString)
    -> ([([ParseError], Token)], TokenizerState))
-> ([([ParseError], Token)], TokenizerState, ByteString)
-> ([([ParseError], Token)], TokenizerState)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([([ParseError], Token)], TokenizerState, ByteString)
-> ([([ParseError], Token)], TokenizerState)
loop (([([ParseError], Token)], TokenizerState, ByteString)
 -> ([([ParseError], Token)], TokenizerState))
-> ([([ParseError], Token)], TokenizerState, ByteString)
-> ([([ParseError], Token)], TokenizerState)
forall a b. (a -> b) -> a -> b
$ TokenizerState
-> ByteString
-> ([([ParseError], Token)], TokenizerState, ByteString)
tokenizeStep TokenizerState
state' ByteString
output


-- | Parse a minimal number of bytes from an input stream, into a sequence of
-- semantic tokens.  Returns all data required to seamlessly resume parsing.
tokenizeStep
    :: TokenizerState
    -> BS.ByteString
    -> ([([ParseError], Token)], TokenizerState, BS.ByteString)
tokenizeStep :: TokenizerState
-> ByteString
-> ([([ParseError], Token)], TokenizerState, ByteString)
tokenizeStep TokenizerState
state ByteString
stream = case ParserT ByteString Maybe ([([ParseError], Token)], TokenizerState)
-> ByteString
-> Maybe (([([ParseError], Token)], TokenizerState), ByteString)
forall stream (gather :: * -> *) out.
ParserT stream gather out -> stream -> gather (out, stream)
runParserT (StateT TokenizerState (Parser ByteString) [([ParseError], Token)]
-> TokenizerState
-> ParserT
     ByteString Maybe ([([ParseError], Token)], TokenizerState)
forall s (m :: * -> *) a. StateT s m a -> s -> m (a, s)
N.S.runStateT StateT TokenizerState (Parser ByteString) [([ParseError], Token)]
dispatcher' TokenizerState
state) ByteString
stream of
    Just (([([ParseError], Token)]
out, TokenizerState
state'), ByteString
stream') -> ([([ParseError], Token)]
out, TokenizerState
state', ByteString
stream')
    Maybe (([([ParseError], Token)], TokenizerState), ByteString)
Nothing -> ([], TokenizerState
stateEof, ByteString
BS.empty)
  where stateEof :: TokenizerState
stateEof = TokenizerState
state
            { decoderState_ :: Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
decoderState_ = Maybe DecoderState
-> Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
forall a b. b -> Either a b
Right (Maybe DecoderState
 -> Either
      (Either SnifferEnvironment Encoding) (Maybe DecoderState))
-> Maybe DecoderState
-> Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
forall a b. (a -> b) -> a -> b
$ ShortByteString -> DecoderState -> DecoderState
setRemainder (ByteString -> ShortByteString
BS.SH.toShort ByteString
stream) (DecoderState -> DecoderState)
-> Maybe DecoderState -> Maybe DecoderState
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
                TokenizerState -> ByteString -> Maybe DecoderState
decoderDefaultState TokenizerState
state ByteString
stream
            }

-- | Explicitly indicate that the input stream will not contain any further
-- bytes, and perform any finalization processing based on that.
finalizeTokenizer :: TokenizerState -> [([ParseError], Token)]
finalizeTokenizer :: TokenizerState -> [([ParseError], Token)]
finalizeTokenizer TokenizerState
state = ([([ParseError], Token)], TokenizerState)
-> [([ParseError], Token)]
forall a b. (a, b) -> a
fst (([([ParseError], Token)], TokenizerState)
 -> [([ParseError], Token)])
-> ([([ParseError], Token)], TokenizerState)
-> [([ParseError], Token)]
forall a b. (a -> b) -> a -> b
$ TokenizerState
-> ByteString -> ([([ParseError], Token)], TokenizerState)
tokenize TokenizerState
state' ByteString
BS.empty
  where state' :: TokenizerState
state' = TokenizerState
state
            { tokenParserState :: TokenParserState
tokenParserState = (TokenizerState -> TokenParserState
tokenParserState TokenizerState
state)
                { atEndOfStream :: Bool
atEndOfStream = Bool
True
                }
            }


-- | Given a string as emitted by the decoder and the final state of that
-- parser, repack it into a single list with that final state encapsulated in a
-- 'Just'; the 'init' of the string is given 'Nothing's.
repackStream :: ([([ParseError], Char)], DecoderState, BS.ByteString) -> [TokenizerInput]
repackStream :: ([([ParseError], Char)], DecoderState, ByteString)
-> [TokenizerInput]
repackStream ([], DecoderState
_, ByteString
_) = []
repackStream ([([ParseError]
errs, Char
c)], DecoderState
dState, ByteString
bs) =
    [[ParseError] -> Char -> DecoderOutputState -> TokenizerInput
TokenizerInput [ParseError]
errs Char
c (DecoderOutputState -> TokenizerInput)
-> DecoderOutputState -> TokenizerInput
forall a b. (a -> b) -> a -> b
$ (Maybe DecoderState, ByteString) -> DecoderOutputState
forall a. a -> Maybe a
Just (DecoderState -> Maybe DecoderState
forall a. a -> Maybe a
Just DecoderState
dState, ByteString
bs)]
repackStream (([ParseError]
errs, Char
c):[([ParseError], Char)]
cs, DecoderState
dState, ByteString
bs) =
    [ParseError] -> Char -> DecoderOutputState -> TokenizerInput
TokenizerInput [ParseError]
errs Char
c DecoderOutputState
forall a. Maybe a
Nothing TokenizerInput -> [TokenizerInput] -> [TokenizerInput]
forall a. a -> [a] -> [a]
: ([([ParseError], Char)], DecoderState, ByteString)
-> [TokenizerInput]
repackStream ([([ParseError], Char)]
cs, DecoderState
dState, ByteString
bs)


-- | Wrap the standard dispatcher to operate over a raw 'BS.ByteString'
-- rather than the "Web.Mangrove.Parse.Encoding" output.
dispatcher' :: StateParser TokenizerState BS.ByteString [([ParseError], Token)]
dispatcher' :: StateT TokenizerState (Parser ByteString) [([ParseError], Token)]
dispatcher' = do
    TokenizerState
state <- StateT TokenizerState (Parser ByteString) TokenizerState
forall (m :: * -> *) s. Monad m => StateT s m s
N.S.get
    ByteString
stream <- StateT TokenizerState (Parser ByteString) ByteString
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m stream
abridge
    let dState :: DecoderState
dState = DecoderState -> Maybe DecoderState -> DecoderState
forall a. a -> Maybe a -> a
Y.fromMaybe (Encoding -> DecoderState
initialDecoderState Encoding
Utf8) (Maybe DecoderState -> DecoderState)
-> Maybe DecoderState -> DecoderState
forall a b. (a -> b) -> a -> b
$ TokenizerState -> ByteString -> Maybe DecoderState
decoderDefaultState TokenizerState
state ByteString
stream
    TokenizerState
-> [TokenizerInput]
-> StateT
     TokenizerState (Parser ByteString) [([ParseError], Token)]
recurse TokenizerState
state ([TokenizerInput]
 -> StateT
      TokenizerState (Parser ByteString) [([ParseError], Token)])
-> [TokenizerInput]
-> StateT
     TokenizerState (Parser ByteString) [([ParseError], Token)]
forall a b. (a -> b) -> a -> b
$ DecoderState -> ByteString -> [TokenizerInput]
preprocessStep' DecoderState
dState ByteString
stream
  where preprocessStep' :: DecoderState -> ByteString -> [TokenizerInput]
preprocessStep' DecoderState
dState ByteString
input = case DecoderState
-> ByteString -> ([([ParseError], Char)], DecoderState, ByteString)
preprocessStep DecoderState
dState ByteString
input of
            ([], DecoderState
_, ByteString
_) -> []
            cs' :: ([([ParseError], Char)], DecoderState, ByteString)
cs'@([([ParseError], Char)]
_, DecoderState
dState', ByteString
input') -> ([([ParseError], Char)], DecoderState, ByteString)
-> [TokenizerInput]
repackStream ([([ParseError], Char)], DecoderState, ByteString)
cs' [TokenizerInput] -> [TokenizerInput] -> [TokenizerInput]
forall a. [a] -> [a] -> [a]
++ DecoderState -> ByteString -> [TokenizerInput]
preprocessStep' DecoderState
dState' ByteString
input'

-- | Loop the tokenization dispatcher until it returns a set of tokens which
-- happens to coincide with a decoder breakpoint.  Relies on lazy evaluation in
-- the stream generation to avoid forcing the entire thing at once, while still
-- retaining the capability to consume as much input as necessary to get the
-- parsers to line up.
recurse
    :: TokenizerState
    -> [TokenizerInput]
    -> StateParser TokenizerState BS.ByteString [([ParseError], Token)]
recurse :: TokenizerState
-> [TokenizerInput]
-> StateT
     TokenizerState (Parser ByteString) [([ParseError], Token)]
recurse TokenizerState
state [TokenizerInput]
stream = case ParserT
  [TokenizerInput] Maybe ([TokenizerOutput Token], TokenParserState)
-> [TokenizerInput]
-> Maybe
     (([TokenizerOutput Token], TokenParserState), [TokenizerInput])
forall stream (gather :: * -> *) out.
ParserT stream gather out -> stream -> gather (out, stream)
runParserT (StateT
  TokenParserState (Parser [TokenizerInput]) [TokenizerOutput Token]
-> TokenParserState
-> ParserT
     [TokenizerInput] Maybe ([TokenizerOutput Token], TokenParserState)
forall s (m :: * -> *) a. StateT s m a -> s -> m (a, s)
N.S.runStateT StateT
  TokenParserState (Parser [TokenizerInput]) [TokenizerOutput Token]
dispatcher (TokenParserState
 -> ParserT
      [TokenizerInput] Maybe ([TokenizerOutput Token], TokenParserState))
-> TokenParserState
-> ParserT
     [TokenizerInput] Maybe ([TokenizerOutput Token], TokenParserState)
forall a b. (a -> b) -> a -> b
$ TokenizerState -> TokenParserState
tokenParserState TokenizerState
state) [TokenizerInput]
stream of
    Maybe
  (([TokenizerOutput Token], TokenParserState), [TokenizerInput])
Nothing -> StateT TokenizerState (Parser ByteString) [([ParseError], Token)]
forall (f :: * -> *) a. Alternative f => f a
A.empty
    Just (([TokenizerOutput Token]
out, TokenParserState
tokState'), [TokenizerInput]
stream') -> case [TokenizerOutput Token] -> Maybe (TokenizerOutput Token)
forall a. [a] -> Maybe a
Y.listToMaybe ([TokenizerOutput Token] -> [TokenizerOutput Token]
forall a. [a] -> [a]
reverse [TokenizerOutput Token]
out) Maybe (TokenizerOutput Token)
-> (TokenizerOutput Token -> DecoderOutputState)
-> DecoderOutputState
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= TokenizerOutput Token -> DecoderOutputState
forall out. TokenizerOutput out -> DecoderOutputState
tokenizedState of
        DecoderOutputState
Nothing -> do
            let state' :: TokenizerState
state' = TokenizerState
state
                    { tokenParserState :: TokenParserState
tokenParserState = TokenParserState
tokState'
                    }
            TokenizerState -> StateT TokenizerState (Parser ByteString) ()
forall (m :: * -> *) s. Monad m => s -> StateT s m ()
N.S.put TokenizerState
state'
            [([ParseError], Token)]
out' <- TokenizerState
-> [TokenizerInput]
-> StateT
     TokenizerState (Parser ByteString) [([ParseError], Token)]
recurse TokenizerState
state' [TokenizerInput]
stream'
            [([ParseError], Token)]
-> StateT
     TokenizerState (Parser ByteString) [([ParseError], Token)]
forall (m :: * -> *) a. Monad m => a -> m a
return ([([ParseError], Token)]
 -> StateT
      TokenizerState (Parser ByteString) [([ParseError], Token)])
-> [([ParseError], Token)]
-> StateT
     TokenizerState (Parser ByteString) [([ParseError], Token)]
forall a b. (a -> b) -> a -> b
$ (TokenizerOutput Token -> ([ParseError], Token))
-> [TokenizerOutput Token] -> [([ParseError], Token)]
forall a b. (a -> b) -> [a] -> [b]
map TokenizerOutput Token -> ([ParseError], Token)
forall b. TokenizerOutput b -> ([ParseError], b)
repackOut [TokenizerOutput Token]
out [([ParseError], Token)]
-> [([ParseError], Token)] -> [([ParseError], Token)]
forall a. [a] -> [a] -> [a]
++ [([ParseError], Token)]
out'
        Just (Maybe DecoderState
dState, ByteString
dStream) -> do
            ByteString -> StateT TokenizerState (Parser ByteString) ()
forall (m :: * -> *) stream token.
MonadParser m stream token =>
stream -> m ()
pushChunk ByteString
dStream
            TokenizerState -> StateT TokenizerState (Parser ByteString) ()
forall (m :: * -> *) s. Monad m => s -> StateT s m ()
N.S.put (TokenizerState -> StateT TokenizerState (Parser ByteString) ())
-> TokenizerState -> StateT TokenizerState (Parser ByteString) ()
forall a b. (a -> b) -> a -> b
$ TokenizerState
state
                { decoderState_ :: Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
decoderState_ = Maybe DecoderState
-> Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
forall a b. b -> Either a b
Right Maybe DecoderState
dState
                , tokenParserState :: TokenParserState
tokenParserState = TokenParserState
tokState'
                }
            [([ParseError], Token)]
-> StateT
     TokenizerState (Parser ByteString) [([ParseError], Token)]
forall (m :: * -> *) a. Monad m => a -> m a
return ([([ParseError], Token)]
 -> StateT
      TokenizerState (Parser ByteString) [([ParseError], Token)])
-> [([ParseError], Token)]
-> StateT
     TokenizerState (Parser ByteString) [([ParseError], Token)]
forall a b. (a -> b) -> a -> b
$ (TokenizerOutput Token -> ([ParseError], Token))
-> [TokenizerOutput Token] -> [([ParseError], Token)]
forall a b. (a -> b) -> [a] -> [b]
map TokenizerOutput Token -> ([ParseError], Token)
forall b. TokenizerOutput b -> ([ParseError], b)
repackOut [TokenizerOutput Token]
out
  where repackOut :: TokenizerOutput b -> ([ParseError], b)
repackOut TokenizerOutput b
t' = (TokenizerOutput b -> [ParseError]
forall out. TokenizerOutput out -> [ParseError]
tokenizedErrs TokenizerOutput b
t', TokenizerOutput b -> b
forall out. TokenizerOutput out -> out
tokenizedOut TokenizerOutput b
t')


-- | Specify which section of the finite state machine describing the
-- tokenization algorithm should be active.
tokenizerMode :: CurrentTokenizerState -> TokenizerState -> TokenizerState
tokenizerMode :: CurrentTokenizerState -> TokenizerState -> TokenizerState
tokenizerMode CurrentTokenizerState
mode TokenizerState
state = TokenizerState
state
    { tokenParserState :: TokenParserState
tokenParserState = (TokenizerState -> TokenParserState
tokenParserState TokenizerState
state)
        { currentState :: CurrentTokenizerState
currentState = CurrentTokenizerState
mode
        }
    }

-- | Specify the data to use as the previous tag which had been emitted by the
-- tokenizer.  This only has to be called when required for external algorithms
-- or constructions; the parser automatically updates as required for generated
-- 'StartTag' tokens.
tokenizerStartTag :: Maybe Namespace -> ElementName -> TokenizerState -> TokenizerState
tokenizerStartTag :: Maybe Namespace -> Namespace -> TokenizerState -> TokenizerState
tokenizerStartTag Maybe Namespace
ns Namespace
name TokenizerState
state = TokenizerState
state
    { tokenParserState :: TokenParserState
tokenParserState = (TokenizerState -> TokenParserState
tokenParserState TokenizerState
state)
        { prevStartTag :: Maybe Namespace
prevStartTag = Namespace -> Maybe Namespace
forall a. a -> Maybe a
Just Namespace
name
        , currentNodeNamespace :: Maybe Namespace
currentNodeNamespace = Maybe Namespace
ns
        }
    }

-- | Specify the encoding scheme used by a given parse environment to read from
-- the binary input stream.  Note that this will always use the initial state
-- for the respective decoder; intermediate states as returned by 'decodeStep'
-- are not supported.
tokenizerEncoding :: Either SnifferEnvironment (Maybe Encoding) -> TokenizerState -> TokenizerState
tokenizerEncoding :: Either SnifferEnvironment (Maybe Encoding)
-> TokenizerState -> TokenizerState
tokenizerEncoding Either SnifferEnvironment (Maybe Encoding)
enc' TokenizerState
state = TokenizerState
state
    { decoderState_ :: Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
decoderState_ = case Either SnifferEnvironment (Maybe Encoding)
enc' of
        Right Maybe Encoding
Nothing -> Maybe DecoderState
-> Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
forall a b. b -> Either a b
Right Maybe DecoderState
forall a. Maybe a
Nothing
        Right (Just Encoding
enc) -> Either SnifferEnvironment Encoding
-> Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
forall a b. a -> Either a b
Left (Either SnifferEnvironment Encoding
 -> Either
      (Either SnifferEnvironment Encoding) (Maybe DecoderState))
-> Either SnifferEnvironment Encoding
-> Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
forall a b. (a -> b) -> a -> b
$ Encoding -> Either SnifferEnvironment Encoding
forall a b. b -> Either a b
Right Encoding
enc
        Left SnifferEnvironment
env -> Either SnifferEnvironment Encoding
-> Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
forall a b. a -> Either a b
Left (Either SnifferEnvironment Encoding
 -> Either
      (Either SnifferEnvironment Encoding) (Maybe DecoderState))
-> Either SnifferEnvironment Encoding
-> Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
forall a b. (a -> b) -> a -> b
$ SnifferEnvironment -> Either SnifferEnvironment Encoding
forall a b. a -> Either a b
Left SnifferEnvironment
env
    }