{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Zenacy.HTML.Internal.Lexer
( Lexer(..)
, LexerOptions(..)
, LexerSkip(..)
, lexerNew
, lexerSetRCDATA
, lexerSetRAWTEXT
, lexerSetPLAINTEXT
, lexerSetScriptData
, lexerSkipNextLF
, lexerNext
) where
import Zenacy.HTML.Internal.BS
import Zenacy.HTML.Internal.Buffer
import Zenacy.HTML.Internal.Char
import Zenacy.HTML.Internal.Core
import Zenacy.HTML.Internal.Entity
import Zenacy.HTML.Internal.Token
import Control.Monad
( forM
, forM_
, mapM
, when
)
import Control.Monad.Extra
( whenM
, whenJust
)
import Control.Monad.ST
( ST
)
import Data.STRef
( STRef
, newSTRef
)
import Data.DList
( DList
)
import qualified Data.DList as D
( empty
, snoc
, toList
)
import Data.Map
( Map
)
import qualified Data.Map as Map
( fromList
, lookup
)
import Data.Maybe
( isJust
)
import Data.Word
( Word8
)
import Data.Default
( Default(..)
)
import Data.Vector.Storable.Mutable
( MVector(..)
)
import qualified Data.Vector.Storable.Mutable as U
( new
, length
, read
, write
, grow
)
import Debug.Trace (trace)
data LexerOptions = LexerOptions
{ lexerOptionInput :: BS
, lexerOptionLogErrors :: Bool
, lexerOptionIgnoreEntities :: Bool
} deriving (Show)
data LexerSkip
= LexerSkipNone
| LexerSkipLF
deriving (Eq, Ord, Show)
data Lexer s = Lexer
{ lexerData :: BS
, lexerIgnore :: Bool
, lexerLog :: Bool
, lexerOffset :: STRef s Int
, lexerToken :: STRef s (TokenBuffer s)
, lexerBuffer :: STRef s (Buffer s)
, lexerLast :: STRef s [Word8]
, lexerState :: STRef s LexerState
, lexerReturn :: STRef s LexerState
, lexerSkip :: STRef s LexerSkip
, lexerErrors :: STRef s (DList BS)
, lexerCode :: STRef s Int
}
instance Default LexerOptions where
def = LexerOptions
{ lexerOptionInput = bsEmpty
, lexerOptionLogErrors = False
, lexerOptionIgnoreEntities = False
}
data LexerState
= StateData
| StateRCDATA
| StateRAWTEXT
| StateScriptData
| StatePLAINTEXT
| StateTagOpen
| StateEndTagOpen
| StateTagName
| StateRCDATALessThan
| StateRCDATAEndTagOpen
| StateRCDATAEndTagName
| StateRAWTEXTLessThan
| StateRAWTEXTEndTagOpen
| StateRAWTEXTEndTagName
| StateScriptDataLessThan
| StateScriptDataEndTagOpen
| StateScriptDataEndTagName
| StateScriptDataEscapeStart
| StateScriptDataEscapeStartDash
| StateScriptDataEscaped
| StateScriptDataEscapedDash
| StateScriptDataEscapedDashDash
| StateScriptDataEscapedLessThan
| StateScriptDataEscapedEndTagOpen
| StateScriptDataEscapedEndTagName
| StateScriptDataDoubleEscapedStart
| StateScriptDataDoubleEscaped
| StateScriptDataDoubleEscapedDash
| StateScriptDataDoubleEscapedDashDash
| StateScriptDataDoubleEscapedLessThan
| StateScriptDataDoubleEscapeEnd
| StateBeforeAttrName
| StateAttrName
| StateAfterAttrName
| StateBeforeAttrValue
| StateAttrValueDoubleQuoted
| StateAttrValueSingleQuoted
| StateAttrValueUnquoted
| StateAfterAttrValue
| StateSelfClosingStartTag
| StateBogusComment
| StateMarkupDeclarationOpen
| StateCommentStart
| StateCommentStartDash
| StateComment
| StateCommentLessThan
| StateCommentLessThanBang
| StateCommentLessThanBangDash
| StateCommentLessThanBangDashDash
| StateCommentEndDash
| StateCommentEnd
| StateCommentEndBang
| StateDoctype
| StateBeforeDoctypeName
| StateDoctypeName
| StateAfterDoctypeName
| StateAfterDoctypePublicKeyword
| StateBeforeDoctypePublicId
| StateDoctypePublicIdDoubleQuoted
| StateDoctypePublicIdSingleQuoted
| StateAfterDoctypePublicId
| StateBetweenDoctypePublicAndSystem
| StateAfterDoctypeSystemKeyword
| StateBeforeDoctypeSystemId
| StateDoctypeSystemIdDoubleQuoted
| StateDoctypeSystemIdSingleQuoted
| StateAfterDoctypeSystemId
| StateBogusDoctype
| StateCDATASection
| StateCDATASectionBracket
| StateCDATASectionEnd
| StateCharacterReference
| StateNamedCharacterReference
| StateAmbiguousAmpersand
| StateNumericCharacterReference
| StateHexCharacterReferenceStart
| StateDecimalCharacterReferenceStart
| StateHexCharacterReference
| StateDecimalCharacterReference
| StateNumericCharacterReferenceEnd
deriving (Show, Eq, Ord)
lexerDispatch :: LexerState -> Lexer s -> ST s ()
lexerDispatch = \case
StateData ->
doData
StateRCDATA ->
doRCDATA
StateRAWTEXT ->
doRAWTEXT
StateScriptData ->
doScriptData
StatePLAINTEXT ->
doPLAINTEXT
StateTagOpen ->
doTagOpen
StateEndTagOpen ->
doEndTagOpen
StateTagName ->
doTagName
StateRCDATALessThan ->
doRCDATALessThan
StateRCDATAEndTagOpen ->
doRCDATAEndTagOpen
StateRCDATAEndTagName ->
doRCDATAEndTagName
StateRAWTEXTLessThan ->
doRAWTEXTLessThan
StateRAWTEXTEndTagOpen ->
doRAWTEXTEndTagOpen
StateRAWTEXTEndTagName ->
doRAWTEXTEndTagName
StateScriptDataLessThan ->
doScriptDataLessThan
StateScriptDataEndTagOpen ->
doScriptDataEndTagOpen
StateScriptDataEndTagName ->
doScriptDataEndTagName
StateScriptDataEscapeStart ->
doScriptDataEscapeStart
StateScriptDataEscapeStartDash ->
doScriptDataEscapeStartDash
StateScriptDataEscaped ->
doScriptDataEscaped
StateScriptDataEscapedDash ->
doScriptDataEscapedDash
StateScriptDataEscapedDashDash ->
doScriptDataEscapedDashDash
StateScriptDataEscapedLessThan ->
doScriptDataEscapedLessThan
StateScriptDataEscapedEndTagOpen ->
doScriptDataEscapedEndTagOpen
StateScriptDataEscapedEndTagName ->
doScriptDataEscapedEndTagName
StateScriptDataDoubleEscapedStart ->
doScriptDataDoubleEscapedStart
StateScriptDataDoubleEscaped ->
doScriptDataDoubleEscaped
StateScriptDataDoubleEscapedDash ->
doScriptDataDoubleEscapedDash
StateScriptDataDoubleEscapedDashDash ->
doScriptDataDoubleEscapedDashDash
StateScriptDataDoubleEscapedLessThan ->
doScriptDataDoubleEscapedLessThan
StateScriptDataDoubleEscapeEnd ->
doScriptDataDoubleEscapeEnd
StateBeforeAttrName ->
doBeforeAttrName
StateAttrName ->
doAttrName
StateAfterAttrName ->
doAfterAttrName
StateBeforeAttrValue ->
doBeforeAttrValue
StateAttrValueDoubleQuoted ->
doAttrValueDoubleQuoted
StateAttrValueSingleQuoted ->
doAttrValueSingleQuoted
StateAttrValueUnquoted ->
doAttrValueUnquoted
StateAfterAttrValue ->
doAfterAttrValue
StateSelfClosingStartTag ->
doSelfClosingStartTag
StateBogusComment ->
doBogusComment
StateMarkupDeclarationOpen ->
doMarkupDeclarationOpen
StateCommentStart ->
doCommentStart
StateCommentStartDash ->
doCommentStartDash
StateComment ->
doComment
StateCommentLessThan ->
doCommentLessThan
StateCommentLessThanBang ->
doCommentLessThanBang
StateCommentLessThanBangDash ->
doCommentLessThanBangDash
StateCommentLessThanBangDashDash ->
doCommentLessThanBangDashDash
StateCommentEndDash ->
doCommentEndDash
StateCommentEnd ->
doCommentEnd
StateCommentEndBang ->
doCommentEndBang
StateDoctype ->
doDoctype
StateBeforeDoctypeName ->
doBeforeDoctypeName
StateDoctypeName ->
doDoctypeName
StateAfterDoctypeName ->
doAfterDoctypeName
StateAfterDoctypePublicKeyword ->
doAfterDoctypePublicKeyword
StateBeforeDoctypePublicId ->
doBeforeDoctypePublicId
StateDoctypePublicIdDoubleQuoted ->
doDoctypePublicIdDoubleQuoted
StateDoctypePublicIdSingleQuoted ->
doDoctypePublicIdSingleQuoted
StateAfterDoctypePublicId ->
doAfterDoctypePublicId
StateBetweenDoctypePublicAndSystem ->
doBetweenDoctypePublicAndSystem
StateAfterDoctypeSystemKeyword ->
doAfterDoctypeSystemKeyword
StateBeforeDoctypeSystemId ->
doBeforeDoctypeSystemId
StateDoctypeSystemIdDoubleQuoted ->
doDoctypeSystemIdDoubleQuoted
StateDoctypeSystemIdSingleQuoted ->
doDoctypeSystemIdSingleQuoted
StateAfterDoctypeSystemId ->
doAfterDoctypeSystemId
StateBogusDoctype ->
doBogusDoctype
StateCDATASection ->
doCDATASection
StateCDATASectionBracket ->
doCDATASectionBracket
StateCDATASectionEnd ->
doCDATASectionEnd
StateCharacterReference ->
doCharacterReference
StateNamedCharacterReference ->
doNamedCharacterReference
StateAmbiguousAmpersand ->
doAmbiguousAmpersand
StateNumericCharacterReference ->
doNumericCharacterReference
StateHexCharacterReferenceStart ->
doHexCharacterReferenceStart
StateDecimalCharacterReferenceStart ->
doDecimalCharacterReferenceStart
StateHexCharacterReference ->
doHexCharacterReference
StateDecimalCharacterReference ->
doDecimalCharacterReference
StateNumericCharacterReferenceEnd ->
doNumericCharacterReferenceEnd
lexerNew :: LexerOptions -> ST s (Either BS (Lexer s))
lexerNew o@LexerOptions{..}
| Just i <- bsElemIndex 0 lexerOptionInput =
pure $ Left $ bsConcat [ "input contains null at ", bcPack $ show i ]
| otherwise =
Right <$> lexerMake o
lexerMake :: LexerOptions -> ST s (Lexer s)
lexerMake LexerOptions{..} = do
offset <- newSTRef 0
state <- newSTRef StateData
ret <- newSTRef StateData
token <- tokenBuffer
buffer <- bufferNew
last <- newSTRef []
skip <- newSTRef LexerSkipNone
errors <- newSTRef D.empty
code <- newSTRef 0
pure $ Lexer
{ lexerData = lexerOptionInput
, lexerIgnore = lexerOptionIgnoreEntities
, lexerLog = lexerOptionLogErrors
, lexerOffset = offset
, lexerToken = token
, lexerBuffer = buffer
, lexerLast = last
, lexerState = state
, lexerReturn = ret
, lexerSkip = skip
, lexerErrors = errors
, lexerCode = code
}
lexerSetRCDATA :: Lexer s -> ST s ()
lexerSetRCDATA Lexer{..} = wref lexerState StateRCDATA
lexerSetRAWTEXT :: Lexer s -> ST s ()
lexerSetRAWTEXT Lexer{..} = wref lexerState StateRAWTEXT
lexerSetPLAINTEXT :: Lexer s -> ST s ()
lexerSetPLAINTEXT Lexer{..} = wref lexerState StatePLAINTEXT
lexerSetScriptData :: Lexer s -> ST s ()
lexerSetScriptData Lexer{..} = wref lexerState StateScriptData
lexerSkipNextLF :: Lexer s -> ST s ()
lexerSkipNextLF Lexer{..} = wref lexerSkip LexerSkipLF
lexerNext :: Lexer s -> ST s Token
lexerNext x @ Lexer {..} =
skip
where
skip = do
t <- next
case t of
TChar 0x10 ->
rref lexerSkip >>= \case
LexerSkipLF ->
wref lexerSkip LexerSkipNone >> skip
LexerSkipNone ->
pure t
_otherwise ->
pure t
next = do
i <- tokenNext lexerToken
if i == 0
then do
tokenReset lexerToken
s <- rref lexerState
lexerDispatch s x
i <- tokenFirst lexerToken
if i == 0
then pure TEOF
else tokenPack i lexerToken
else do
tokenPack i lexerToken
nextWord :: Lexer s -> ST s Word8
nextWord x @ Lexer {..} = do
offset <- rref lexerOffset
if | offset < bsLen lexerData -> do
wref lexerOffset (offset + 1)
pure $ bsIndex lexerData offset
| otherwise ->
pure chrEOF
peekWord :: Lexer s -> ST s Word8
peekWord x @ Lexer {..} = do
offset <- rref lexerOffset
if | offset < bsLen lexerData ->
pure $ bsIndex lexerData offset
| otherwise ->
pure chrEOF
backWord :: Lexer s -> ST s ()
backWord x @ Lexer {..} = uref lexerOffset (subtract 1)
skipWord :: Lexer s -> Int -> ST s ()
skipWord x @ Lexer {..} n = uref lexerOffset (+n)
dataIndexer :: Lexer s -> ST s (Int -> Word8)
dataIndexer x @ Lexer {..} = do
offset <- rref lexerOffset
pure $ \i -> bsIndex lexerData (offset + i)
dataRemain :: Lexer s -> ST s Int
dataRemain x @ Lexer {..} = do
offset <- rref lexerOffset
pure $ bsLen lexerData - offset
emit :: Lexer s -> ST s ()
emit x @ Lexer {..} = do
tokenTail lexerToken >>= flip tokenTagStartName lexerToken >>= \case
Just a -> wref lexerLast a
Nothing -> pure ()
emitChar :: Lexer s -> Word8 -> ST s ()
emitChar x @ Lexer {..} w = do
tokenCharInit w lexerToken
emit x
emitBuffer :: Lexer s -> ST s ()
emitBuffer x @ Lexer {..} = do
bufferApply (emitChar x) lexerBuffer
bufferReset lexerBuffer
state :: Lexer s -> LexerState -> ST s ()
state x @ Lexer {..} = wref lexerState
returnSet :: Lexer s -> LexerState -> ST s ()
returnSet x @ Lexer {..} = wref lexerReturn
returnGet :: Lexer s -> ST s LexerState
returnGet x @ Lexer {..} = rref lexerReturn
returnState :: Lexer s -> ST s ()
returnState x = returnGet x >>= flip lexerDispatch x
parseError :: Lexer s -> BS -> ST s ()
parseError x @ Lexer {..} =
when lexerLog . (uref lexerErrors . flip D.snoc)
appropriateEndTag :: Lexer s -> ST s Bool
appropriateEndTag x @ Lexer {..} = do
tokenTail lexerToken >>= flip tokenTagEndName lexerToken >>= \case
Just a -> (==a) <$> rref lexerLast
Nothing -> pure False
consumingAttibute :: Lexer s -> ST s Bool
consumingAttibute x @ Lexer {..} = do
a <- returnGet x
pure $ any (==a)
[ StateAttrValueDoubleQuoted
, StateAttrValueSingleQuoted
, StateAttrValueUnquoted
]
flushCodePoints :: Lexer s -> ST s ()
flushCodePoints x @ Lexer {..} = do
a <- consumingAttibute x
if | a -> do
bufferApply (flip tokenAttrValAppend lexerToken) lexerBuffer
bufferReset lexerBuffer
| otherwise -> do
emitBuffer x
doData :: Lexer s -> ST s ()
doData x @ Lexer {..} = do
c <- nextWord x
if | c == chrAmpersand -> do
returnSet x StateData
doCharacterReference x
| c == chrLess -> do
doTagOpen x
| c == chrEOF -> do
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doData x
doRCDATA :: Lexer s -> ST s ()
doRCDATA x @ Lexer {..} = do
c <- nextWord x
if | c == chrAmpersand -> do
returnSet x StateRCDATA
doCharacterReference x
| c == chrLess -> do
doRCDATALessThan x
| c == chrEOF -> do
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doRCDATA x
doRAWTEXT :: Lexer s -> ST s ()
doRAWTEXT x @ Lexer {..} = do
c <- nextWord x
if | c == chrLess -> do
doRAWTEXTLessThan x
| c == chrEOF -> do
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doRAWTEXT x
doScriptData :: Lexer s -> ST s ()
doScriptData x @ Lexer {..} = do
c <- nextWord x
if | c == chrLess -> do
doScriptDataLessThan x
| c == chrEOF -> do
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doScriptData x
doPLAINTEXT :: Lexer s -> ST s ()
doPLAINTEXT x @ Lexer {..} = do
c <- nextWord x
if | c == chrEOF -> do
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doPLAINTEXT x
doTagOpen :: Lexer s -> ST s ()
doTagOpen x @ Lexer {..} = do
c <- nextWord x
if | c == chrExclamation -> do
doMarkupDeclarationOpen x
| c == chrSolidus -> do
doEndTagOpen x
| chrASCIIAlpha c -> do
tokenTagStartInit lexerToken
backWord x
doTagName x
| c == chrQuestion -> do
parseError x "unexpected-question-mark-instead-of-tag-name"
tokenCommentInit lexerToken
backWord x
doBogusComment x
| c == chrEOF -> do
parseError x "eof-before-tag-name"
tokenCharInit chrLess lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "invalid-first-character-of-tag-name"
backWord x
doData x
doEndTagOpen :: Lexer s -> ST s ()
doEndTagOpen x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIAlpha c -> do
tokenTagEndInit lexerToken
backWord x
doTagName x
| c == chrGreater -> do
parseError x "missing-end-tag-name"
doData x
| c == chrEOF -> do
parseError x "eof-before-tag-name"
tokenCharInit chrLess lexerToken
tokenCharInit chrSolidus lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "invalid-first-character-of-tag-name"
tokenCommentInit lexerToken
backWord x
doBogusComment x
doTagName :: Lexer s -> ST s ()
doTagName x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeAttrName x
| c == chrSolidus -> do
doSelfClosingStartTag x
| c == chrGreater -> do
state x StateData
emit x
| chrASCIIUpperAlpha c -> do
tokenTagNameAppend (chrToLower c) lexerToken
doTagName x
| c == chrEOF -> do
parseError x "eof-in-tag"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenTagNameAppend c lexerToken
doTagName x
doRCDATALessThan :: Lexer s -> ST s ()
doRCDATALessThan x @ Lexer {..} = do
c <- nextWord x
if | c == chrSolidus -> do
bufferReset lexerBuffer
doRCDATAEndTagOpen x
| otherwise -> do
tokenCharInit chrLess lexerToken
backWord x
doRCDATA x
doRCDATAEndTagOpen :: Lexer s -> ST s ()
doRCDATAEndTagOpen x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIAlpha c -> do
tokenTagEndInit lexerToken
backWord x
doRCDATAEndTagName x
| otherwise -> do
tokenCharInit chrLess lexerToken
tokenCharInit chrSolidus lexerToken
backWord x
doRCDATA x
doRCDATAEndTagName :: Lexer s -> ST s ()
doRCDATAEndTagName x @ Lexer {..} = do
c <- nextWord x
a <- appropriateEndTag x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
if a
then do
doBeforeAttrName x
else do
anythingElse
| c == chrSolidus -> do
if a
then do
doSelfClosingStartTag x
else do
anythingElse
| c == chrGreater -> do
if a
then do
state x StateData
emit x
else do
anythingElse
| chrASCIIUpperAlpha c -> do
tokenTagNameAppend (chrToLower c) lexerToken
bufferAppend c lexerBuffer
doRCDATAEndTagName x
| chrASCIILowerAlpha c -> do
tokenTagNameAppend c lexerToken
bufferAppend c lexerBuffer
doRCDATAEndTagName x
| otherwise -> do
anythingElse
where
anythingElse = do
tokenDrop lexerToken
emitChar x chrLess
emitChar x chrSolidus
emitBuffer x
backWord x
doRCDATA x
doRAWTEXTLessThan :: Lexer s -> ST s ()
doRAWTEXTLessThan x @ Lexer {..} = do
c <- nextWord x
if | c == chrSolidus -> do
bufferReset lexerBuffer
doRAWTEXTEndTagOpen x
| otherwise -> do
tokenCharInit chrLess lexerToken
backWord x
doRAWTEXT x
doRAWTEXTEndTagOpen :: Lexer s -> ST s ()
doRAWTEXTEndTagOpen x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIAlpha c -> do
tokenTagEndInit lexerToken
backWord x
doRAWTEXTEndTagName x
| otherwise -> do
emitChar x chrLess
emitChar x chrSolidus
backWord x
doRAWTEXT x
doRAWTEXTEndTagName :: Lexer s -> ST s ()
doRAWTEXTEndTagName x @ Lexer {..} = do
c <- nextWord x
a <- appropriateEndTag x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
if a
then do
doBeforeAttrName x
else do
anythingElse
| c == chrSolidus -> do
if a
then do
doSelfClosingStartTag x
else do
anythingElse
| c == chrGreater -> do
if a
then do
state x StateData
emit x
else do
anythingElse
| chrASCIIUpperAlpha c -> do
tokenTagNameAppend (chrToLower c) lexerToken
bufferAppend c lexerBuffer
doRAWTEXTEndTagName x
| chrASCIILowerAlpha c -> do
tokenTagNameAppend c lexerToken
bufferAppend c lexerBuffer
doRAWTEXTEndTagName x
| otherwise -> do
anythingElse
where
anythingElse = do
tokenDrop lexerToken
emitChar x chrLess
emitChar x chrSolidus
emitBuffer x
backWord x
doRAWTEXT x
doScriptDataLessThan :: Lexer s -> ST s ()
doScriptDataLessThan x @ Lexer {..} = do
c <- nextWord x
if | c == chrSolidus -> do
bufferReset lexerBuffer
doScriptDataEndTagOpen x
| c == chrExclamation -> do
tokenCharInit chrLess lexerToken
tokenCharInit chrExclamation lexerToken
doScriptDataEscapeStart x
| otherwise -> do
tokenCharInit chrLess lexerToken
backWord x
doScriptData x
doScriptDataEndTagOpen :: Lexer s -> ST s ()
doScriptDataEndTagOpen x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIAlpha c -> do
tokenTagEndInit lexerToken
backWord x
doScriptDataEndTagName x
| otherwise -> do
emitChar x chrLess
emitChar x chrSolidus
backWord x
doScriptData x
doScriptDataEndTagName :: Lexer s -> ST s ()
doScriptDataEndTagName x @ Lexer {..} = do
c <- nextWord x
a <- appropriateEndTag x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
if a
then do
doBeforeAttrName x
else do
anythingElse
| c == chrSolidus -> do
if a
then do
doSelfClosingStartTag x
else do
anythingElse
| c == chrGreater -> do
if a
then do
state x StateData
emit x
else do
anythingElse
| chrASCIIUpperAlpha c -> do
tokenTagNameAppend (chrToLower c) lexerToken
bufferAppend c lexerBuffer
doScriptDataEndTagName x
| chrASCIILowerAlpha c -> do
tokenTagNameAppend c lexerToken
bufferAppend c lexerBuffer
doScriptDataEndTagName x
| otherwise -> do
anythingElse
where
anythingElse = do
tokenDrop lexerToken
emitChar x chrLess
emitChar x chrSolidus
emitBuffer x
backWord x
doScriptData x
doScriptDataEscapeStart :: Lexer s -> ST s ()
doScriptDataEscapeStart x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
emitChar x chrHyphen
doScriptDataEscapeStartDash x
| otherwise -> do
backWord x
doScriptData x
doScriptDataEscapeStartDash :: Lexer s -> ST s ()
doScriptDataEscapeStartDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
emitChar x chrHyphen
doScriptDataEscapedDashDash x
| otherwise -> do
backWord x
doScriptData x
doScriptDataEscaped :: Lexer s -> ST s ()
doScriptDataEscaped x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
emitChar x chrHyphen
doScriptDataEscapedDash x
| c == chrLess -> do
doScriptDataEscapedLessThan x
| c == chrEOF -> do
parseError x "eof-in-script-html-comment-like-text"
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doScriptDataEscaped x
doScriptDataEscapedDash :: Lexer s -> ST s ()
doScriptDataEscapedDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
emitChar x chrHyphen
doScriptDataEscapedDashDash x
| c == chrLess -> do
doScriptDataEscapedLessThan x
| c == chrEOF -> do
parseError x "eof-in-script-html-comment-like-text"
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doScriptDataEscaped x
doScriptDataEscapedDashDash :: Lexer s -> ST s ()
doScriptDataEscapedDashDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
emitChar x c
doScriptDataEscapedDashDash x
| c == chrLess -> do
doScriptDataEscapedLessThan x
| c == chrGreater -> do
emitChar x c
doScriptData x
| c == chrEOF -> do
parseError x "eof-in-script-html-comment-like-text"
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doScriptDataEscaped x
doScriptDataEscapedLessThan :: Lexer s -> ST s ()
doScriptDataEscapedLessThan x @ Lexer {..} = do
c <- nextWord x
if | c == chrSolidus -> do
bufferReset lexerBuffer
doScriptDataEscapedEndTagOpen x
| chrASCIIAlpha c -> do
bufferReset lexerBuffer
emitChar x chrLess
backWord x
doScriptDataDoubleEscapedStart x
| otherwise -> do
emitChar x chrLess
backWord x
doScriptDataEscaped x
doScriptDataEscapedEndTagOpen :: Lexer s -> ST s ()
doScriptDataEscapedEndTagOpen x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIAlpha c -> do
tokenTagEndInit lexerToken
backWord x
doScriptDataEscapedEndTagName x
| otherwise -> do
emitChar x chrLess
emitChar x chrSolidus
backWord x
doScriptDataEscaped x
doScriptDataEscapedEndTagName :: Lexer s -> ST s ()
doScriptDataEscapedEndTagName x @ Lexer {..} = do
c <- nextWord x
a <- appropriateEndTag x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
if a
then do
doBeforeAttrName x
else do
anythingElse
| c == chrSolidus -> do
if a
then do
doSelfClosingStartTag x
else do
anythingElse
| c == chrGreater -> do
if a
then do
state x StateData
emit x
else do
anythingElse
| chrASCIIUpperAlpha c -> do
tokenTagNameAppend (chrToLower c) lexerToken
bufferAppend c lexerBuffer
doScriptDataEscapedEndTagName x
| chrASCIILowerAlpha c -> do
tokenTagNameAppend c lexerToken
bufferAppend c lexerBuffer
doScriptDataEscapedEndTagName x
| otherwise -> do
anythingElse
where
anythingElse = do
tokenDrop lexerToken
emitChar x chrLess
emitChar x chrSolidus
emitBuffer x
backWord x
state x StateScriptDataEscaped
emit x
doScriptDataDoubleEscapedStart :: Lexer s -> ST s ()
doScriptDataDoubleEscapedStart x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace ||
c == chrSolidus ||
c == chrGreater -> do
bufferContains (bsUnpack "script") lexerBuffer >>= \case
True -> do
doScriptDataDoubleEscaped x
False -> do
tokenCharInit c lexerToken
state x StateScriptDataEscaped
emit x
| chrASCIIUpperAlpha c -> do
bufferAppend (chrToLower c) lexerBuffer
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscapedStart
emit x
| chrASCIILowerAlpha c -> do
bufferAppend c lexerBuffer
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscapedStart
emit x
| otherwise -> do
backWord x
doScriptDataEscaped x
doScriptDataDoubleEscaped :: Lexer s -> ST s ()
doScriptDataDoubleEscaped x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscapedDash
emit x
| c == chrLess -> do
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscapedLessThan
emit x
| c == chrEOF -> do
parseError x "eof-in-script-html-comment-like-text"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCharInit c lexerToken
emit x
doScriptDataDoubleEscapedDash :: Lexer s -> ST s ()
doScriptDataDoubleEscapedDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscapedDashDash
emit x
| c == chrLess -> do
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscapedLessThan
emit x
| c == chrEOF -> do
parseError x "eof-in-script-html-comment-like-text"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscaped
emit x
doScriptDataDoubleEscapedDashDash :: Lexer s -> ST s ()
doScriptDataDoubleEscapedDashDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
emitChar x c
doScriptDataDoubleEscapedDashDash x
| c == chrLess -> do
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscapedLessThan
emit x
| c == chrGreater -> do
tokenCharInit c lexerToken
state x StateScriptData
emit x
| c == chrEOF -> do
parseError x "eof-in-script-html-comment-like-text"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscaped
emit x
doScriptDataDoubleEscapedLessThan :: Lexer s -> ST s ()
doScriptDataDoubleEscapedLessThan x @ Lexer {..} = do
c <- nextWord x
if | c == chrSolidus -> do
bufferReset lexerBuffer
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscapeEnd
emit x
| otherwise -> do
backWord x
doScriptDataDoubleEscaped x
doScriptDataDoubleEscapeEnd :: Lexer s -> ST s ()
doScriptDataDoubleEscapeEnd x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace ||
c == chrSolidus ||
c == chrGreater -> do
bufferContains (bsUnpack "script") lexerBuffer >>= \case
True -> do
doScriptDataEscaped x
False -> do
tokenCharInit c lexerToken
state x StateScriptDataDoubleEscaped
emit x
| chrASCIIUpperAlpha c -> do
bufferAppend (chrToLower c) lexerBuffer
emitChar x c
doScriptDataDoubleEscapeEnd x
| chrASCIILowerAlpha c -> do
bufferAppend c lexerBuffer
emitChar x c
doScriptDataDoubleEscapeEnd x
| otherwise -> do
backWord x
doScriptDataDoubleEscaped x
doBeforeAttrName :: Lexer s -> ST s ()
doBeforeAttrName x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeAttrName x
| c == chrSolidus ||
c == chrGreater ||
c == chrEOF -> do
backWord x
doAfterAttrName x
| c == chrEqual -> do
parseError x "unexpected-equals-sign-before-attribute-name"
tokenAttrInit lexerToken
tokenAttrNameAppend c lexerToken
doAttrName x
| otherwise -> do
tokenAttrInit lexerToken
backWord x
doAttrName x
doAttrName :: Lexer s -> ST s ()
doAttrName x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace ||
c == chrSolidus ||
c == chrGreater ||
c == chrEOF -> do
checkAttr
backWord x
doAfterAttrName x
| c == chrEqual -> do
checkAttr
doBeforeAttrValue x
| chrASCIIUpperAlpha c -> do
tokenAttrNameAppend (chrToLower c) lexerToken
doAttrName x
| c == chrQuote ||
c == chrApostrophe ||
c == chrLess -> do
parseError x "unexpected-character-in-attribute-name"
tokenAttrNameAppend c lexerToken
doAttrName x
| otherwise -> do
tokenAttrNameAppend c lexerToken
doAttrName x
where
checkAttr = do
i <- tokenTail lexerToken
whenM (tokenAttrNamePrune i lexerToken) $ do
parseError x "duplicate-attribute"
doAfterAttrName :: Lexer s -> ST s ()
doAfterAttrName x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doAfterAttrName x
| c == chrSolidus -> do
doSelfClosingStartTag x
| c == chrEqual -> do
doBeforeAttrValue x
| c == chrGreater -> do
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-tag"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenAttrInit lexerToken
backWord x
doAttrName x
doBeforeAttrValue :: Lexer s -> ST s ()
doBeforeAttrValue x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeAttrValue x
| c == chrQuote -> do
doAttrValueDoubleQuoted x
| c == chrApostrophe -> do
doAttrValueSingleQuoted x
| c == chrGreater -> do
parseError x "missing-attribute-value"
state x StateData
emit x
| otherwise -> do
backWord x
doAttrValueUnquoted x
doAttrValueDoubleQuoted :: Lexer s -> ST s ()
doAttrValueDoubleQuoted x @ Lexer {..} = do
c <- nextWord x
if | c == chrQuote -> do
doAfterAttrValue x
| c == chrAmpersand -> do
returnSet x StateAttrValueDoubleQuoted
doCharacterReference x
| c == chrEOF -> do
parseError x "eof-in-tag"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenAttrValAppend c lexerToken
doAttrValueDoubleQuoted x
doAttrValueSingleQuoted :: Lexer s -> ST s ()
doAttrValueSingleQuoted x @ Lexer {..} = do
c <- nextWord x
if | c == chrApostrophe -> do
doAfterAttrValue x
| c == chrAmpersand -> do
returnSet x StateAttrValueSingleQuoted
doCharacterReference x
| c == chrEOF -> do
parseError x "eof-in-tag"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenAttrValAppend c lexerToken
doAttrValueSingleQuoted x
doAttrValueUnquoted :: Lexer s -> ST s ()
doAttrValueUnquoted x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeAttrName x
| c == chrAmpersand -> do
returnSet x StateAttrValueUnquoted
doCharacterReference x
| c == chrGreater -> do
state x StateData
emit x
| c == chrQuote ||
c == chrApostrophe ||
c == chrLess ||
c == chrEqual ||
c == chrGrave -> do
parseError x "unexpected-character-in-unquoted-attribute-value"
tokenAttrValAppend c lexerToken
doAttrValueUnquoted x
| c == chrEOF -> do
parseError x "eof-in-tag"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenAttrValAppend c lexerToken
doAttrValueUnquoted x
doAfterAttrValue :: Lexer s -> ST s ()
doAfterAttrValue x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeAttrName x
| c == chrSolidus -> do
doSelfClosingStartTag x
| c == chrGreater -> do
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-tag"
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "missing-whitespace-between-attributes"
backWord x
doBeforeAttrName x
doSelfClosingStartTag :: Lexer s -> ST s ()
doSelfClosingStartTag x @ Lexer {..} = do
c <- nextWord x
if | c == chrGreater -> do
tokenTagStartSetSelfClosing lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-tag"
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "unexpected-solidus-in-tag"
backWord x
doBeforeAttrName x
doBogusComment :: Lexer s -> ST s ()
doBogusComment x @ Lexer {..} = do
c <- nextWord x
if | c == chrGreater -> do
state x StateData
emit x
| c == chrEOF -> do
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCommentAppend c lexerToken
doBogusComment x
doMarkupDeclarationOpen :: Lexer s -> ST s ()
doMarkupDeclarationOpen x @ Lexer {..} = do
f <- dataIndexer x
n <- dataRemain x
if | n > 1 &&
f 0 == chrHyphen &&
f 1 == chrHyphen -> do
skipWord x 2
tokenCommentInit lexerToken
doCommentStart x
| n > 6 &&
(f 0 == 0x44 || f 0 == 0x64) &&
(f 1 == 0x4F || f 1 == 0x6F) &&
(f 2 == 0x43 || f 2 == 0x63) &&
(f 3 == 0x54 || f 3 == 0x74) &&
(f 4 == 0x59 || f 4 == 0x79) &&
(f 5 == 0x50 || f 5 == 0x70) &&
(f 6 == 0x45 || f 6 == 0x65) -> do
skipWord x 7
doDoctype x
| n > 6 &&
f 0 == 0x5B &&
f 1 == 0x43 &&
f 2 == 0x44 &&
f 3 == 0x41 &&
f 4 == 0x54 &&
f 5 == 0x41 &&
f 6 == 0x5B -> do
skipWord x 7
if True
then do
state x StateCDATASection
doCDATASection x
else do
parseError x "cdata-in-html-content"
tokenCommentInit lexerToken
mapM_ (flip tokenCommentAppend lexerToken)
[ 0x5B, 0x43, 0x44, 0x41, 0x54, 0x41, 0x5B ]
doBogusComment x
| otherwise -> do
parseError x "incorrectly-opened-comment"
tokenCommentInit lexerToken
doBogusComment x
doCommentStart :: Lexer s -> ST s ()
doCommentStart x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
doCommentStartDash x
| c == chrGreater -> do
parseError x "abrupt-closing-of-empty-comment"
state x StateData
emit x
| otherwise -> do
backWord x
doComment x
doCommentStartDash :: Lexer s -> ST s ()
doCommentStartDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
doCommentEnd x
| c == chrGreater -> do
parseError x "abrupt-closing-of-empty-comment"
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-comment"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCommentAppend chrHyphen lexerToken
backWord x
doComment x
doComment :: Lexer s -> ST s ()
doComment x @ Lexer {..} = do
c <- nextWord x
if | c == chrLess -> do
tokenCommentAppend c lexerToken
doCommentLessThan x
| c == chrHyphen -> do
doCommentEndDash x
| c == chrEOF -> do
parseError x "eof-in-comment"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCommentAppend c lexerToken
doComment x
doCommentLessThan :: Lexer s -> ST s ()
doCommentLessThan x @ Lexer {..} = do
c <- nextWord x
if | c == chrExclamation -> do
tokenCommentAppend c lexerToken
doCommentLessThanBang x
| c == chrLess -> do
tokenCommentAppend c lexerToken
doCommentLessThan x
| otherwise -> do
backWord x
doComment x
doCommentLessThanBang :: Lexer s -> ST s ()
doCommentLessThanBang x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
doCommentLessThanBangDash x
| otherwise -> do
backWord x
doComment x
doCommentLessThanBangDash :: Lexer s -> ST s ()
doCommentLessThanBangDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
doCommentLessThanBangDashDash x
| otherwise -> do
backWord x
doCommentEndDash x
doCommentLessThanBangDashDash :: Lexer s -> ST s ()
doCommentLessThanBangDashDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen ||
c == chrEOF -> do
backWord x
doCommentEnd x
| otherwise -> do
parseError x "nested-comment"
backWord x
doCommentEnd x
doCommentEndDash :: Lexer s -> ST s ()
doCommentEndDash x @ Lexer {..} = do
c <- nextWord x
if | c == chrHyphen -> do
doCommentEnd x
| c == chrEOF -> do
parseError x "eof-in-comment"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCommentAppend chrHyphen lexerToken
backWord x
doComment x
doCommentEnd :: Lexer s -> ST s ()
doCommentEnd x @ Lexer {..} = do
c <- nextWord x
if | c == chrGreater -> do
state x StateData
emit x
| c == chrExclamation -> do
doCommentEndBang x
| c == chrHyphen -> do
tokenCommentAppend chrHyphen lexerToken
doCommentEnd x
| c == chrEOF -> do
parseError x "eof-in-comment"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCommentAppend chrHyphen lexerToken
tokenCommentAppend chrHyphen lexerToken
backWord x
doComment x
doCommentEndBang :: Lexer s -> ST s ()
doCommentEndBang x @ Lexer {..} = do
c <- nextWord x
if | c == chrGreater -> do
state x StateData
emit x
| c == chrHyphen -> do
tokenCommentAppend chrHyphen lexerToken
tokenCommentAppend chrHyphen lexerToken
tokenCommentAppend chrExclamation lexerToken
doCommentEndDash x
| c == chrGreater -> do
parseError x "incorrectly-closed-comment"
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-comment"
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenCommentAppend chrHyphen lexerToken
tokenCommentAppend chrHyphen lexerToken
tokenCommentAppend chrExclamation lexerToken
backWord x
doComment x
doDoctype :: Lexer s -> ST s ()
doDoctype x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeDoctypeName x
| c == chrGreater -> do
backWord x
doBeforeDoctypeName x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeInit lexerToken
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "missing-whitespace-before-doctype-name"
backWord x
doBeforeDoctypeName x
doBeforeDoctypeName :: Lexer s -> ST s ()
doBeforeDoctypeName x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeDoctypeName x
| chrASCIIUpperAlpha c -> do
tokenDoctypeInit lexerToken
tokenDoctypeNameAppend (chrToLower c) lexerToken
doDoctypeName x
| c == chrGreater -> do
parseError x "missing-doctype-name"
tokenDoctypeInit lexerToken
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeInit lexerToken
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenDoctypeInit lexerToken
tokenDoctypeNameAppend c lexerToken
doDoctypeName x
doDoctypeName :: Lexer s -> ST s ()
doDoctypeName x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doAfterDoctypeName x
| c == chrGreater -> do
state x StateData
emit x
| chrASCIIUpperAlpha c -> do
tokenDoctypeNameAppend (chrToLower c) lexerToken
doDoctypeName x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenDoctypeNameAppend c lexerToken
doDoctypeName x
doAfterDoctypeName :: Lexer s -> ST s ()
doAfterDoctypeName x @ Lexer {..} = do
c <- nextWord x
f <- dataIndexer x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doAfterDoctypeName x
| c == chrGreater -> do
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| (c == 0x50 || c == 0x70) &&
(f 0 == 0x55 || f 0 == 0x75) &&
(f 1 == 0x42 || f 1 == 0x62) &&
(f 2 == 0x4C || f 2 == 0x6C) &&
(f 3 == 0x49 || f 3 == 0x69) &&
(f 4 == 0x43 || f 4 == 0x63) -> do
skipWord x 5
doAfterDoctypePublicKeyword x
| (c == 0x53 || c == 0x73) &&
(f 0 == 0x59 || f 0 == 0x79) &&
(f 1 == 0x53 || f 1 == 0x73) &&
(f 2 == 0x54 || f 2 == 0x74) &&
(f 3 == 0x45 || f 3 == 0x65) &&
(f 4 == 0x4D || f 4 == 0x6D) -> do
skipWord x 5
doAfterDoctypeSystemKeyword x
| otherwise -> do
parseError x "invalid-character-sequence-after-doctype-name"
tokenDoctypeSetForceQuirks lexerToken
doBogusDoctype x
doAfterDoctypePublicKeyword :: Lexer s -> ST s ()
doAfterDoctypePublicKeyword x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeDoctypePublicId x
| c == chrQuote -> do
parseError x "missing-whitespace-after-doctype-public-keyword"
tokenDoctypePublicIdInit lexerToken
doDoctypePublicIdDoubleQuoted x
| c == chrApostrophe -> do
parseError x "missing-whitespace-after-doctype-public-keyword"
tokenDoctypePublicIdInit lexerToken
doDoctypePublicIdSingleQuoted x
| c == chrGreater -> do
parseError x "missing-doctype-public-identifier"
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "missing-quote-before-doctype-public-identifier"
tokenDoctypeSetForceQuirks lexerToken
doBogusDoctype x
doBeforeDoctypePublicId :: Lexer s -> ST s ()
doBeforeDoctypePublicId x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeDoctypePublicId x
| c == chrQuote -> do
tokenDoctypePublicIdInit lexerToken
doDoctypePublicIdDoubleQuoted x
| c == chrApostrophe -> do
tokenDoctypePublicIdInit lexerToken
doDoctypePublicIdSingleQuoted x
| c == chrGreater -> do
parseError x "missing-doctype-public-identifier"
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "missing-quote-before-doctype-public-identifier"
tokenDoctypeSetForceQuirks lexerToken
doBogusDoctype x
doDoctypePublicIdDoubleQuoted :: Lexer s -> ST s ()
doDoctypePublicIdDoubleQuoted x @ Lexer {..} = do
c <- nextWord x
if | c == chrQuote -> do
doAfterDoctypePublicId x
| c == chrGreater -> do
parseError x "abrupt-doctype-public-identifier"
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenDoctypePublicIdAppend c lexerToken
doDoctypePublicIdDoubleQuoted x
doDoctypePublicIdSingleQuoted :: Lexer s -> ST s ()
doDoctypePublicIdSingleQuoted x @ Lexer {..} = do
c <- nextWord x
if | c == chrApostrophe -> do
doAfterDoctypePublicId x
| c == chrGreater -> do
parseError x "abrupt-doctype-public-identifier"
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenDoctypePublicIdAppend c lexerToken
doDoctypePublicIdSingleQuoted x
doAfterDoctypePublicId :: Lexer s -> ST s ()
doAfterDoctypePublicId x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBetweenDoctypePublicAndSystem x
| c == chrGreater -> do
state x StateData
emit x
| c == chrQuote -> do
parseError x "missing-whitespace-between-doctype-public-and-system-identifiers"
tokenDoctypeSystemIdInit lexerToken
doDoctypeSystemIdDoubleQuoted x
| c == chrApostrophe -> do
parseError x "missing-whitespace-between-doctype-public-and-system-identifiers"
tokenDoctypeSystemIdInit lexerToken
doDoctypeSystemIdSingleQuoted x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "missing-quote-before-doctype-system-identifier"
tokenDoctypeSetForceQuirks lexerToken
doBogusDoctype x
doBetweenDoctypePublicAndSystem :: Lexer s -> ST s ()
doBetweenDoctypePublicAndSystem x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBetweenDoctypePublicAndSystem x
| c == chrGreater -> do
state x StateData
emit x
| c == chrQuote -> do
tokenDoctypeSystemIdInit lexerToken
doDoctypeSystemIdDoubleQuoted x
| c == chrApostrophe -> do
tokenDoctypeSystemIdInit lexerToken
doDoctypeSystemIdSingleQuoted x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "missing-quote-before-doctype-system-identifier"
tokenDoctypeSetForceQuirks lexerToken
doBogusDoctype x
doAfterDoctypeSystemKeyword :: Lexer s -> ST s ()
doAfterDoctypeSystemKeyword x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeDoctypeSystemId x
| c == chrQuote -> do
parseError x "missing-whitespace-after-doctype-system-keyword"
tokenDoctypeSystemIdInit lexerToken
doDoctypeSystemIdDoubleQuoted x
| c == chrApostrophe -> do
parseError x "missing-whitespace-after-doctype-system-keyword"
tokenDoctypeSystemIdInit lexerToken
doDoctypeSystemIdSingleQuoted x
| c == chrGreater -> do
parseError x "missing-doctype-system-identifier"
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "missing-quote-before-doctype-system-identifier"
tokenDoctypeSetForceQuirks lexerToken
doBogusDoctype x
doBeforeDoctypeSystemId :: Lexer s -> ST s ()
doBeforeDoctypeSystemId x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doBeforeDoctypeSystemId x
| c == chrQuote -> do
tokenDoctypeSystemIdInit lexerToken
doDoctypeSystemIdDoubleQuoted x
| c == chrApostrophe -> do
tokenDoctypeSystemIdInit lexerToken
doDoctypeSystemIdSingleQuoted x
| c == chrGreater -> do
parseError x "missing-doctype-system-identifier"
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "missing-quote-before-doctype-system-identifier"
tokenDoctypeSetForceQuirks lexerToken
doBogusDoctype x
doDoctypeSystemIdDoubleQuoted :: Lexer s -> ST s ()
doDoctypeSystemIdDoubleQuoted x @ Lexer {..} = do
c <- nextWord x
if | c == chrQuote -> do
doAfterDoctypeSystemId x
| c == chrGreater -> do
parseError x "abrupt-doctype-system-identifier"
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenDoctypeSystemIdAppend c lexerToken
doDoctypeSystemIdDoubleQuoted x
doDoctypeSystemIdSingleQuoted :: Lexer s -> ST s ()
doDoctypeSystemIdSingleQuoted x @ Lexer {..} = do
c <- nextWord x
if | c == chrApostrophe -> do
doAfterDoctypeSystemId x
| c == chrGreater -> do
parseError x "abrupt-doctype-system-identifier"
tokenDoctypeSetForceQuirks lexerToken
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
tokenDoctypeSystemIdAppend c lexerToken
doDoctypeSystemIdSingleQuoted x
doAfterDoctypeSystemId :: Lexer s -> ST s ()
doAfterDoctypeSystemId x @ Lexer {..} = do
c <- nextWord x
if | c == chrTab ||
c == chrLF ||
c == chrFF ||
c == chrSpace -> do
doAfterDoctypeSystemId x
| c == chrGreater -> do
state x StateData
emit x
| c == chrEOF -> do
parseError x "eof-in-doctype"
tokenDoctypeSetForceQuirks lexerToken
tokenEOFInit lexerToken
emit x
| otherwise -> do
parseError x "unexpected-character-after-doctype-system-identifier"
doBogusDoctype x
doBogusDoctype :: Lexer s -> ST s ()
doBogusDoctype x @ Lexer {..} = do
c <- nextWord x
if | c == chrGreater -> do
state x StateData
emit x
| c == chrEOF -> do
tokenEOFInit lexerToken
emit x
| otherwise -> do
doBogusDoctype x
doCDATASection :: Lexer s -> ST s ()
doCDATASection x @ Lexer {..} = do
c <- nextWord x
if | c == chrBracketRight -> do
doCDATASectionBracket x
| c == chrEOF -> do
parseError x "eof-in-cdata"
tokenEOFInit lexerToken
emit x
| otherwise -> do
emitChar x c
doCDATASection x
doCDATASectionBracket :: Lexer s -> ST s ()
doCDATASectionBracket x @ Lexer {..} = do
c <- nextWord x
if | c == chrBracketRight -> do
doCDATASectionEnd x
| otherwise -> do
tokenCharInit chrBracketRight lexerToken
backWord x
doCDATASection x
doCDATASectionEnd :: Lexer s -> ST s ()
doCDATASectionEnd x @ Lexer {..} = do
c <- nextWord x
if | c == chrBracketRight -> do
emitChar x c
doCDATASectionEnd x
| c == chrGreater -> do
doData x
| otherwise -> do
emitChar x chrBracketRight
emitChar x chrBracketRight
backWord x
doCDATASection x
doCharacterReference :: Lexer s -> ST s ()
doCharacterReference x @ Lexer {..} = do
bufferReset lexerBuffer
bufferAppend chrAmpersand lexerBuffer
c <- nextWord x
if | lexerIgnore -> do
flushCodePoints x
backWord x
returnState x
| chrASCIIAlphanumeric c -> do
backWord x
doNamedCharacterReference x
| c == chrNumberSign -> do
bufferAppend c lexerBuffer
doNumericCharacterReference x
| otherwise -> do
flushCodePoints x
backWord x
returnState x
doNamedCharacterReference :: Lexer s -> ST s ()
doNamedCharacterReference x @ Lexer {..} = do
o <- rref lexerOffset
case entityMatch (bsDrop o lexerData) of
Just (p, v, _) -> do
skipWord x $ bsLen p
forM_ (bsUnpack p) $
flip bufferAppend lexerBuffer
attr <- consumingAttibute x
semi <- pure $ bsLast p == Just chrSemicolon
c <- peekWord x
if | attr
, not semi
, c == chrSemicolon || chrASCIIAlphanumeric c -> do
flushCodePoints x
returnState x
| otherwise -> do
when (not semi) $
parseError x "missing-semicolon-after-character-reference"
bufferReset lexerBuffer
forM_ (bsUnpack v) $
flip bufferAppend lexerBuffer
flushCodePoints x
returnState x
Nothing -> do
flushCodePoints x
doAmbiguousAmpersand x
doAmbiguousAmpersand :: Lexer s -> ST s ()
doAmbiguousAmpersand x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIAlphanumeric c -> do
consumingAttibute x >>= \case
True -> tokenAttrValAppend c lexerToken
False -> emitChar x c
doAmbiguousAmpersand x
| c == chrSemicolon -> do
parseError x "unknown-named-character-reference"
backWord x
returnState x
| otherwise -> do
backWord x
returnState x
doNumericCharacterReference :: Lexer s -> ST s ()
doNumericCharacterReference x @ Lexer {..} = do
wref lexerCode 0
c <- nextWord x
if | c == chrUpperX || c == chrLowerX -> do
bufferAppend c lexerBuffer
doHexCharacterReferenceStart x
| otherwise -> do
backWord x
doDecimalCharacterReference x
doHexCharacterReferenceStart :: Lexer s -> ST s ()
doHexCharacterReferenceStart x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIHexDigit c -> do
backWord x
doHexCharacterReference x
| otherwise -> do
parseError x "absence-of-digits-in-numeric-character-reference"
flushCodePoints x
backWord x
returnState x
doDecimalCharacterReferenceStart :: Lexer s -> ST s ()
doDecimalCharacterReferenceStart x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIDigit c -> do
backWord x
doDecimalCharacterReference x
| otherwise -> do
parseError x "absence-of-digits-in-numeric-character-reference"
flushCodePoints x
backWord x
returnState x
doHexCharacterReference :: Lexer s -> ST s ()
doHexCharacterReference x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIDigit c -> do
uref lexerCode $ \y -> 16 * y + (fromIntegral c - 0x30)
doHexCharacterReference x
| chrASCIIUpperHexDigit c -> do
uref lexerCode $ \y -> 16 * y + (fromIntegral c - 0x37)
doHexCharacterReference x
| chrASCIILowerHexDigit c -> do
uref lexerCode $ \y -> 16 * y + (fromIntegral c - 0x57)
doHexCharacterReference x
| c == chrSemicolon -> do
doNumericCharacterReferenceEnd x
| otherwise -> do
parseError x "missing-semicolon-after-character-reference"
backWord x
doNumericCharacterReferenceEnd x
doDecimalCharacterReference :: Lexer s -> ST s ()
doDecimalCharacterReference x @ Lexer {..} = do
c <- nextWord x
if | chrASCIIDigit c -> do
uref lexerCode $ \y -> 10 * y + (fromIntegral c - 0x30)
doDecimalCharacterReference x
| c == chrSemicolon -> do
doNumericCharacterReferenceEnd x
| otherwise -> do
parseError x "missing-semicolon-after-character-reference"
backWord x
doNumericCharacterReferenceEnd x
doNumericCharacterReferenceEnd :: Lexer s -> ST s ()
doNumericCharacterReferenceEnd x @ Lexer {..} = do
c <- rref lexerCode
let n = fromIntegral c
if | c == 0 -> do
parseError x "null-character-reference"
wref lexerCode 0xFFFD
| c > 0x10FFFF -> do
parseError x "character-reference-outside-unicode-range"
wref lexerCode 0xFFFD
| chrSurrogate c -> do
parseError x "surrogate-character-reference"
wref lexerCode 0xFFFD
| chrNonCharacter c -> do
parseError x "noncharacter-character-reference"
| c == 0x0D || (chrWord8 c && chrControl n && not (chrWhitespace n)) -> do
parseError x "control-character-reference"
whenJust (Map.lookup c codeMap) $ wref lexerCode
| otherwise ->
pure ()
bufferReset lexerBuffer
forM_ (chrUTF8 c) $ flip bufferAppend lexerBuffer
flushCodePoints x
returnState x
codeMap :: Map Int Int
codeMap = Map.fromList
[ (0x80, 0x20AC)
, (0x82, 0x201A)
, (0x83, 0x0192)
, (0x84, 0x201E)
, (0x85, 0x2026)
, (0x86, 0x2020)
, (0x87, 0x2021)
, (0x88, 0x02C6)
, (0x89, 0x2030)
, (0x8A, 0x0160)
, (0x8B, 0x2039)
, (0x8C, 0x0152)
, (0x8E, 0x017D)
, (0x91, 0x2018)
, (0x92, 0x2019)
, (0x93, 0x201C)
, (0x94, 0x201D)
, (0x95, 0x2022)
, (0x96, 0x2013)
, (0x97, 0x2014)
, (0x98, 0x02DC)
, (0x99, 0x2122)
, (0x9A, 0x0161)
, (0x9B, 0x203A)
, (0x9C, 0x0153)
, (0x9E, 0x017E)
, (0x9F, 0x0178)
]