{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE LambdaCase #-}

{-# OPTIONS_GHC -O2 #-}

-- | This is a performance-oriented HTML tokenizer aim at web-crawling
-- applications. It follows the HTML5 parsing specification quite closely,
-- so it behaves reasonable well on ill-formed documents from the open Web.
--
module Text.HTML.Parser
    ( -- * Parsing
      parseTokens
    , parseTokensLazy
    , token
      -- * Types
    , Token(..)
    , TagName, AttrName, AttrValue
    , Attr(..)
      -- * Rendering, text canonicalization
    , renderTokens
    , renderToken
    , renderAttrs
    , renderAttr
    , canonicalizeTokens
    ) where

import Data.Char hiding (isSpace)
import Data.List (unfoldr)
import GHC.Generics
import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq

import Data.Attoparsec.Text
import qualified Data.Attoparsec.Text.Lazy as AL
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Text.Lazy.Builder (Builder)
import qualified Data.Text.Lazy.Builder as B
import Prelude hiding (take, takeWhile)

import Text.HTML.Parser.Entities (entities)
import qualified Data.Trie as Trie

-- Section numbers refer to W3C HTML 5.2 specification.

-- | A tag name (e.g. @body@)
type TagName   = Text

-- | An attribute name (e.g. @href@)
type AttrName  = Text

-- | The value of an attribute
type AttrValue = Text

-- | An HTML token
data Token
  -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'.
  = TagOpen !TagName [Attr]
  -- | A self-closing tag.
  | TagSelfClose !TagName [Attr]
  -- | A closing tag.
  | TagClose !TagName
  -- | The content between tags.
  | ContentText !Text
  -- | A single character of content
  | ContentChar !Char
  -- | Contents of a comment.
  | Comment !Builder
  -- | Doctype
  | Doctype !Text
  deriving (Int -> Token -> ShowS
[Token] -> ShowS
Token -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Token] -> ShowS
$cshowList :: [Token] -> ShowS
show :: Token -> String
$cshow :: Token -> String
showsPrec :: Int -> Token -> ShowS
$cshowsPrec :: Int -> Token -> ShowS
Show, Eq Token
Token -> Token -> Bool
Token -> Token -> Ordering
Token -> Token -> Token
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Token -> Token -> Token
$cmin :: Token -> Token -> Token
max :: Token -> Token -> Token
$cmax :: Token -> Token -> Token
>= :: Token -> Token -> Bool
$c>= :: Token -> Token -> Bool
> :: Token -> Token -> Bool
$c> :: Token -> Token -> Bool
<= :: Token -> Token -> Bool
$c<= :: Token -> Token -> Bool
< :: Token -> Token -> Bool
$c< :: Token -> Token -> Bool
compare :: Token -> Token -> Ordering
$ccompare :: Token -> Token -> Ordering
Ord, Token -> Token -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Token -> Token -> Bool
$c/= :: Token -> Token -> Bool
== :: Token -> Token -> Bool
$c== :: Token -> Token -> Bool
Eq, forall x. Rep Token x -> Token
forall x. Token -> Rep Token x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Token x -> Token
$cfrom :: forall x. Token -> Rep Token x
Generic)

-- | This is a bit of a hack
endOfFileToken :: Token
endOfFileToken :: Token
endOfFileToken = Text -> Token
ContentText Text
""

-- | An attribute of a tag
data Attr = Attr !AttrName !AttrValue
          deriving (Int -> Attr -> ShowS
[Attr] -> ShowS
Attr -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Attr] -> ShowS
$cshowList :: [Attr] -> ShowS
show :: Attr -> String
$cshow :: Attr -> String
showsPrec :: Int -> Attr -> ShowS
$cshowsPrec :: Int -> Attr -> ShowS
Show, Attr -> Attr -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Attr -> Attr -> Bool
$c/= :: Attr -> Attr -> Bool
== :: Attr -> Attr -> Bool
$c== :: Attr -> Attr -> Bool
Eq, Eq Attr
Attr -> Attr -> Bool
Attr -> Attr -> Ordering
Attr -> Attr -> Attr
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Attr -> Attr -> Attr
$cmin :: Attr -> Attr -> Attr
max :: Attr -> Attr -> Attr
$cmax :: Attr -> Attr -> Attr
>= :: Attr -> Attr -> Bool
$c>= :: Attr -> Attr -> Bool
> :: Attr -> Attr -> Bool
$c> :: Attr -> Attr -> Bool
<= :: Attr -> Attr -> Bool
$c<= :: Attr -> Attr -> Bool
< :: Attr -> Attr -> Bool
$c< :: Attr -> Attr -> Bool
compare :: Attr -> Attr -> Ordering
$ccompare :: Attr -> Attr -> Ordering
Ord)

instance NFData Token where
    rnf :: Token -> ()
rnf (Comment Builder
b) = forall a. NFData a => a -> ()
rnf forall a b. (a -> b) -> a -> b
$ Builder -> Text
B.toLazyText Builder
b
    rnf Token
_           = ()

-- | Parse a single 'Token'.
token :: Parser Token
token :: Parser Token
token = Parser Token
dataState -- Start in the data state.

-- | /§8.2.4.1/: Data state
dataState :: Parser Token
dataState :: Parser Token
dataState = do
    Text
content <- (Char -> Bool) -> Parser Text
takeWhile (\Char
c -> Char
c forall a. Eq a => a -> a -> Bool
/= Char
'<' Bool -> Bool -> Bool
&& Char
c forall a. Eq a => a -> a -> Bool
/= Char
'&')
    if Bool -> Bool
not forall a b. (a -> b) -> a -> b
$ Text -> Bool
T.null Text
content
      then forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Text -> Token
ContentText Text
content
      else forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
        [ Char -> Parser Char
char Char
'<' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
tagOpen
        , forall i a. Parser i a -> Parser i a
try forall a b. (a -> b) -> a -> b
$ Char -> Parser Char
char Char
'&' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
charRef
        , Char -> Token
ContentChar Char
'&' forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
char Char
'&'
        ]

charRef :: Parser Token
charRef :: Parser Token
charRef = Trie Char Token -> Parser Token
go Trie Char Token
entityTrie
  where
    go :: Trie.Trie Char Token -> Parser Token
    go :: Trie Char Token -> Parser Token
go Trie Char Token
trie = do
      Char
c <- Parser Char
anyChar
      case Char
c of
        Char
';' -> forall b a. b -> (a -> b) -> Maybe a -> b
maybe forall (f :: * -> *) a. Alternative f => f a
empty forall (m :: * -> *) a. Monad m => a -> m a
return (forall k v. Trie k v -> Maybe v
Trie.terminal Trie Char Token
trie)
        Char
_ -> Trie Char Token -> Parser Token
go (forall k v. Ord k => k -> Trie k v -> Trie k v
Trie.step Char
c Trie Char Token
trie)

entityTrie :: Trie.Trie Char Token
entityTrie :: Trie Char Token
entityTrie = forall k v. Ord k => [([k], v)] -> Trie k v
Trie.fromList
    [ (Text -> String
T.unpack Text
name, Text -> Token
ContentText Text
expansion)
    | (Text
name, Text
expansion) <- [(Text, Text)]
entities
    ]

-- | /§8.2.4.6/: Tag open state
tagOpen :: Parser Token
tagOpen :: Parser Token
tagOpen =
        (Char -> Parser Char
char Char
'!' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
markupDeclOpen)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'/' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
endTagOpen)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'?' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
bogusComment forall a. Monoid a => a
mempty)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Token
tagNameOpen
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Token
other
  where
    other :: Parser Token
other = do
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Char -> Token
ContentChar Char
'<'

-- | /§8.2.4.7/: End tag open state
endTagOpen :: Parser Token
endTagOpen :: Parser Token
endTagOpen = Parser Token
tagNameClose

-- | Equivalent to @inClass "\x09\x0a\x0c "@
isWhitespace :: Char -> Bool
isWhitespace :: Char -> Bool
isWhitespace Char
'\x09' = Bool
True
isWhitespace Char
'\x0a' = Bool
True
isWhitespace Char
'\x0c' = Bool
True
isWhitespace Char
'\x0d' = Bool
True
isWhitespace Char
' '    = Bool
True
isWhitespace Char
_      = Bool
False

orC :: (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
orC :: (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
orC Char -> Bool
f Char -> Bool
g Char
c = Char -> Bool
f Char
c Bool -> Bool -> Bool
|| Char -> Bool
g Char
c
{-# INLINE orC #-}

isC :: Char -> Char -> Bool
isC :: Char -> Char -> Bool
isC = forall a. Eq a => a -> a -> Bool
(==)
{-# INLINE isC #-}

-- | /§8.2.4.8/: Tag name state: the open case
--
-- deviation: no lower-casing, don't handle NULL characters
tagNameOpen :: Parser Token
tagNameOpen :: Parser Token
tagNameOpen = do
    Text
tag <- Parser Text
tagName'
    forall a. a -> a
id forall a b. (a -> b) -> a -> b
$  ((Char -> Bool) -> Parser Char
satisfy Char -> Bool
isWhitespace forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [])
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'/' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [])
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag []))

-- | /§8.2.4.10/: Tag name state: close case
tagNameClose :: Parser Token
tagNameClose :: Parser Token
tagNameClose = do
    Text
tag <- Parser Text
tagName'
    Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> Token
TagClose Text
tag)

-- | /§8.2.4.10/: Tag name state: common code
--
-- deviation: no lower-casing, don't handle NULL characters
tagName' :: Parser Text
tagName' :: Parser Text
tagName' = do
    Char
c <- Parser Char
peekChar'
    forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Char -> Bool
isAsciiUpper Char
c Bool -> Bool -> Bool
|| Char -> Bool
isAsciiLower Char
c
    (Char -> Bool) -> Parser Text
takeWhile forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool
isWhitespace (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'/' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'<' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'>')

-- | /§8.2.4.40/: Self-closing start tag state
selfClosingStartTag :: TagName -> [Attr] -> Parser Token
selfClosingStartTag :: Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [Attr]
attrs = do
        (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagSelfClose Text
tag [Attr]
attrs))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return Token
endOfFileToken)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [Attr]
attrs

-- | /§8.2.4.32/: Before attribute name state
--
-- deviation: no lower-casing
beforeAttrName :: TagName -> [Attr] -> Parser Token
beforeAttrName :: Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [Attr]
attrs = do
    (Char -> Bool) -> Parser Text ()
skipWhile Char -> Bool
isWhitespace
    forall a. a -> a
id forall a b. (a -> b) -> a -> b
$  (Char -> Parser Char
char Char
'/' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [Attr]
attrs)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag [Attr]
attrs))
      -- <|> (char '\x00' >> attrName tag attrs) -- TODO: NULL
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> [Attr] -> Parser Token
attrName Text
tag [Attr]
attrs

-- | /§8.2.4.33/: Attribute name state
attrName :: TagName -> [Attr] -> Parser Token
attrName :: Text -> [Attr] -> Parser Token
attrName Text
tag [Attr]
attrs = do
    Text
name <- (Char -> Bool) -> Parser Text
takeWhile forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool
isWhitespace (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'/' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'=' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'>')
    forall a. a -> a
id forall a b. (a -> b) -> a -> b
$  (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
afterAttrName Text
tag [Attr]
attrs Text
name)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'=' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
beforeAttrValue Text
tag [Attr]
attrs Text
name)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall i a. Parser i a -> Parser i a
try (do Maybe Char
mc <- Parser (Maybe Char)
peekChar
                  case Maybe Char
mc of
                    Just Char
c | Char -> Bool
notNameChar Char
c ->  Text -> [Attr] -> Text -> Parser Token
afterAttrName Text
tag [Attr]
attrs Text
name
                    Maybe Char
_ -> forall (f :: * -> *) a. Alternative f => f a
empty)
      -- <|> -- TODO: NULL
  where notNameChar :: Char -> Bool
notNameChar = Char -> Bool
isWhitespace (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'/' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'>'

-- | /§8.2.4.34/: After attribute name state
afterAttrName :: TagName -> [Attr] -> AttrName -> Parser Token
afterAttrName :: Text -> [Attr] -> Text -> Parser Token
afterAttrName Text
tag [Attr]
attrs Text
name = do
    (Char -> Bool) -> Parser Text ()
skipWhile Char -> Bool
isWhitespace
    forall a. a -> a
id forall a b. (a -> b) -> a -> b
$  (Char -> Parser Char
char Char
'/' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [Attr]
attrs)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'=' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
beforeAttrValue Text
tag [Attr]
attrs Text
name)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag (Text -> Text -> Attr
Attr Text
name Text
T.empty forall a. a -> [a] -> [a]
: [Attr]
attrs)))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return Token
endOfFileToken)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> [Attr] -> Parser Token
attrName Text
tag (Text -> Text -> Attr
Attr Text
name Text
T.empty forall a. a -> [a] -> [a]
: [Attr]
attrs)  -- not exactly sure this is right

-- | /§8.2.4.35/: Before attribute value state
beforeAttrValue :: TagName -> [Attr] -> AttrName -> Parser Token
beforeAttrValue :: Text -> [Attr] -> Text -> Parser Token
beforeAttrValue Text
tag [Attr]
attrs Text
name = do
    (Char -> Bool) -> Parser Text ()
skipWhile Char -> Bool
isWhitespace
    forall a. a -> a
id forall a b. (a -> b) -> a -> b
$  (Char -> Parser Char
char Char
'"' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
attrValueDQuoted Text
tag [Attr]
attrs Text
name)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'\'' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
attrValueSQuoted Text
tag [Attr]
attrs Text
name)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag (Text -> Text -> Attr
Attr Text
name Text
T.empty forall a. a -> [a] -> [a]
: [Attr]
attrs)))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> [Attr] -> Text -> Parser Token
attrValueUnquoted Text
tag [Attr]
attrs Text
name

-- | /§8.2.4.36/: Attribute value (double-quoted) state
attrValueDQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueDQuoted :: Text -> [Attr] -> Text -> Parser Token
attrValueDQuoted Text
tag [Attr]
attrs Text
name = do
    Text
value <- (Char -> Bool) -> Parser Text
takeWhile (forall a. Eq a => a -> a -> Bool
/= Char
'"')
    Char
_ <- Char -> Parser Char
char Char
'"'
    Text -> [Attr] -> Text -> Text -> Parser Token
afterAttrValueQuoted Text
tag [Attr]
attrs Text
name Text
value

-- | /§8.2.4.37/: Attribute value (single-quoted) state
attrValueSQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueSQuoted :: Text -> [Attr] -> Text -> Parser Token
attrValueSQuoted Text
tag [Attr]
attrs Text
name = do
    Text
value <- (Char -> Bool) -> Parser Text
takeWhile (forall a. Eq a => a -> a -> Bool
/= Char
'\'')
    Char
_ <- Char -> Parser Char
char Char
'\''
    Text -> [Attr] -> Text -> Text -> Parser Token
afterAttrValueQuoted Text
tag [Attr]
attrs Text
name Text
value

-- | /§8.2.4.38/: Attribute value (unquoted) state
attrValueUnquoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueUnquoted :: Text -> [Attr] -> Text -> Parser Token
attrValueUnquoted Text
tag [Attr]
attrs Text
name = do
    Text
value <- (Char -> Bool) -> Parser Text
takeTill forall a b. (a -> b) -> a -> b
$ Char -> Bool
isWhitespace (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'>'
    forall a. a -> a
id forall a b. (a -> b) -> a -> b
$  ((Char -> Bool) -> Parser Char
satisfy Char -> Bool
isWhitespace forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [Attr]
attrs) -- unsure: don't emit?
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag (Text -> Text -> Attr
Attr Text
name Text
value forall a. a -> [a] -> [a]
: [Attr]
attrs)))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return Token
endOfFileToken)

-- | /§8.2.4.39/: After attribute value (quoted) state
afterAttrValueQuoted :: TagName -> [Attr] -> AttrName -> AttrValue -> Parser Token
afterAttrValueQuoted :: Text -> [Attr] -> Text -> Text -> Parser Token
afterAttrValueQuoted Text
tag [Attr]
attrs Text
name Text
value =
          ((Char -> Bool) -> Parser Char
satisfy Char -> Bool
isWhitespace forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [Attr]
attrs')
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'/' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [Attr]
attrs')
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag [Attr]
attrs'))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return Token
endOfFileToken)
  where attrs' :: [Attr]
attrs' = Text -> Text -> Attr
Attr Text
name Text
value forall a. a -> [a] -> [a]
: [Attr]
attrs

-- | /§8.2.4.41/: Bogus comment state
bogusComment :: Builder -> Parser Token
bogusComment :: Builder -> Parser Token
bogusComment Builder
content = do
        (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'\x00' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
bogusComment (Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"\xfffd"))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Char
anyChar forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Char
c -> Builder -> Parser Token
bogusComment (Builder
content forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.singleton Char
c))

-- | /§8.2.4.42/: Markup declaration open state
markupDeclOpen :: Parser Token
markupDeclOpen :: Parser Token
markupDeclOpen =
        forall i a. Parser i a -> Parser i a
try Parser Token
comment_
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall i a. Parser i a -> Parser i a
try Parser Token
docType
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
bogusComment forall a. Monoid a => a
mempty
  where
    comment_ :: Parser Token
comment_ = Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
commentStart
    docType :: Parser Token
docType = do
        -- switching this to asciiCI slowed things down by a factor of two
        Text
s <- Int -> Parser Text
take Int
7
        forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Text -> Text
T.toLower Text
s forall a. Eq a => a -> a -> Bool
== Text
"doctype"
        Parser Token
doctype

-- | /§8.2.4.43/: Comment start state
commentStart :: Parser Token
commentStart :: Parser Token
commentStart = do
          (Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
commentStartDash)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment forall a. Monoid a => a
mempty))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
comment forall a. Monoid a => a
mempty

-- | /§8.2.4.44/: Comment start dash state
commentStartDash :: Parser Token
commentStartDash :: Parser Token
commentStartDash =
          (Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEnd forall a. Monoid a => a
mempty)
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment forall a. Monoid a => a
mempty))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment forall a. Monoid a => a
mempty))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Builder -> Parser Token
comment (Char -> Builder
B.singleton Char
'-'))

-- | /§8.2.4.45/: Comment state
comment :: Builder -> Parser Token
comment :: Builder -> Parser Token
comment Builder
content0 = do
    Builder
content <- Text -> Builder
B.fromText forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser Text
takeWhile (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char -> Bool
isC Char
'-' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'\x00' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'<'))
    forall a. a -> a
id forall a b. (a -> b) -> a -> b
$  (Char -> Parser Char
char Char
'<' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThan (Builder
content0 forall a. Semigroup a => a -> a -> a
<> Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"<"))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEndDash (Builder
content0 forall a. Semigroup a => a -> a -> a
<> Builder
content))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'\x00' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
comment (Builder
content0 forall a. Semigroup a => a -> a -> a
<> Builder
content forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.singleton Char
'\xfffd'))
      forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment forall a b. (a -> b) -> a -> b
$ Builder
content0 forall a. Semigroup a => a -> a -> a
<> Builder
content))

-- | /§8.2.46/: Comment less-than sign state
commentLessThan :: Builder -> Parser Token
commentLessThan :: Builder -> Parser Token
commentLessThan Builder
content =
        (Char -> Parser Char
char Char
'!' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThanBang (Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"!"))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'<' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThan (Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"<"))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
comment Builder
content

-- | /§8.2.47/: Comment less-than sign bang state
commentLessThanBang :: Builder -> Parser Token
commentLessThanBang :: Builder -> Parser Token
commentLessThanBang Builder
content =
        (Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThanBangDash Builder
content)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
comment Builder
content

-- | /§8.2.48/: Comment less-than sign bang dash state
commentLessThanBangDash :: Builder -> Parser Token
commentLessThanBangDash :: Builder -> Parser Token
commentLessThanBangDash Builder
content =
        (Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThanBangDashDash Builder
content)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
commentEndDash Builder
content

-- | /§8.2.49/: Comment less-than sign bang dash dash state
commentLessThanBangDashDash :: Builder -> Parser Token
commentLessThanBangDashDash :: Builder -> Parser Token
commentLessThanBangDashDash Builder
content =
        (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
comment Builder
content)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
comment Builder
content)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
commentEnd Builder
content

-- | /§8.2.4.50/: Comment end dash state
commentEndDash :: Builder -> Parser Token
commentEndDash :: Builder -> Parser Token
commentEndDash Builder
content = do
        (Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEnd Builder
content)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Builder -> Parser Token
comment (Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"-"))

-- | /§8.2.4.51/: Comment end state
commentEnd :: Builder -> Parser Token
commentEnd :: Builder -> Parser Token
commentEnd Builder
content = do
        (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'!' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEndBang Builder
content)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEnd (Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"-"))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Builder -> Parser Token
comment (Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"--"))

-- | /§8.2.4.52/: Comment end bang state
commentEndBang :: Builder -> Parser Token
commentEndBang :: Builder -> Parser Token
commentEndBang Builder
content = do
        (Char -> Parser Char
char Char
'-' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEndDash (Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"--!"))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall t. Chunk t => Parser t ()
endOfInput forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Builder -> Parser Token
comment (Builder
content forall a. Semigroup a => a -> a -> a
<> Builder
"--!"))

-- | /§8.2.4.53/: DOCTYPE state
-- FIXME
doctype :: Parser Token
doctype :: Parser Token
doctype = do
    Text
content <- (Char -> Bool) -> Parser Text
takeTill (forall a. Eq a => a -> a -> Bool
==Char
'>')
    Char
_ <- Char -> Parser Char
char Char
'>'
    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Text -> Token
Doctype Text
content

-- | Parse a lazy list of tokens from strict 'Text'.
parseTokens :: Text -> [Token]
parseTokens :: Text -> [Token]
parseTokens = forall b a. (b -> Maybe (a, b)) -> b -> [a]
unfoldr Text -> Maybe (Token, Text)
f
  where
    f :: Text -> Maybe (Token, Text)
    f :: Text -> Maybe (Token, Text)
f Text
t
      | Text -> Bool
T.null Text
t = forall a. Maybe a
Nothing
      | Bool
otherwise =
        case forall a. Parser a -> Text -> Result a
parse Parser Token
token Text
t of
            Done Text
rest Token
tok -> forall a. a -> Maybe a
Just (Token
tok, Text
rest)
            Partial Text -> IResult Text Token
cont  ->
                case Text -> IResult Text Token
cont forall a. Monoid a => a
mempty of
                  Done Text
rest Token
tok -> forall a. a -> Maybe a
Just (Token
tok, Text
rest)
                  IResult Text Token
_             -> forall a. Maybe a
Nothing
            IResult Text Token
_             -> forall a. Maybe a
Nothing

-- | Parse a lazy list of tokens from lazy 'TL.Text'.
parseTokensLazy :: TL.Text -> [Token]
parseTokensLazy :: Text -> [Token]
parseTokensLazy = forall b a. (b -> Maybe (a, b)) -> b -> [a]
unfoldr Text -> Maybe (Token, Text)
f
  where
    f :: TL.Text -> Maybe (Token, TL.Text)
    f :: Text -> Maybe (Token, Text)
f Text
t
      | Text -> Bool
TL.null Text
t = forall a. Maybe a
Nothing
      | Bool
otherwise =
        case forall a. Parser a -> Text -> Result a
AL.parse Parser Token
token Text
t of
            AL.Done Text
rest Token
tok -> forall a. a -> Maybe a
Just (Token
tok, Text
rest)
            Result Token
_                -> forall a. Maybe a
Nothing

-- | See 'renderToken'.
renderTokens :: [Token] -> TL.Text
renderTokens :: [Token] -> Text
renderTokens = forall a. Monoid a => [a] -> a
mconcat forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Token -> Text
renderToken

-- | (Somewhat) canonical string representation of 'Token'.
renderToken :: Token -> TL.Text
renderToken :: Token -> Text
renderToken = Text -> Text
TL.fromStrict forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Monoid a => [a] -> a
mconcat forall b c a. (b -> c) -> (a -> b) -> a -> c
. \case
    (TagOpen Text
n [])         -> [Text
"<", Text
n, Text
">"]
    (TagOpen Text
n [Attr]
attrs)      -> [Text
"<", Text
n, Text
" ", [Attr] -> Text
renderAttrs [Attr]
attrs, Text
">"]
    (TagSelfClose Text
n [Attr]
attrs) -> [Text
"<", Text
n, Text
" ", [Attr] -> Text
renderAttrs [Attr]
attrs, Text
" />"]
    (TagClose Text
n)           -> [Text
"</", Text
n, Text
">"]
    (ContentChar Char
c)        -> [Char -> Text
T.singleton Char
c]
    (ContentText Text
t)        -> [Text
t]
    (Comment Builder
builder)      -> [Text
"<!--", Text -> Text
TL.toStrict forall a b. (a -> b) -> a -> b
$ Builder -> Text
B.toLazyText Builder
builder, Text
"-->"]
    (Doctype Text
t)            -> [Text
"<!DOCTYPE", Text
t, Text
">"]

-- | See 'renderAttr'.
renderAttrs :: [Attr] -> Text
renderAttrs :: [Attr] -> Text
renderAttrs = [Text] -> Text
T.unwords forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Attr -> Text
renderAttr forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. [a] -> [a]
reverse

-- | Does not escape quotation in attribute values!
renderAttr :: Attr -> Text
renderAttr :: Attr -> Text
renderAttr (Attr Text
k Text
v) = forall a. Monoid a => [a] -> a
mconcat [Text
k, Text
"=\"", Text
v, Text
"\""]

-- | Meld neighoring 'ContentChar' and 'ContentText'
-- constructors together and drops empty text elements.
canonicalizeTokens :: [Token] -> [Token]
canonicalizeTokens :: [Token] -> [Token]
canonicalizeTokens = forall a. (a -> Bool) -> [a] -> [a]
filter (forall a. Eq a => a -> a -> Bool
/= Text -> Token
ContentText Text
"") forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Token] -> [Token]
meldTextTokens

meldTextTokens :: [Token] -> [Token]
meldTextTokens :: [Token] -> [Token]
meldTextTokens = [Token] -> [Token]
concatTexts forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Token -> Token
charToText
  where
    charToText :: Token -> Token
charToText (ContentChar Char
c) = Text -> Token
ContentText (Char -> Text
T.singleton Char
c)
    charToText Token
t = Token
t

    concatTexts :: [Token] -> [Token]
concatTexts = \case
      (ContentText Text
t : ContentText Text
t' : [Token]
ts) -> [Token] -> [Token]
concatTexts forall a b. (a -> b) -> a -> b
$ Text -> Token
ContentText (Text
t forall a. Semigroup a => a -> a -> a
<> Text
t') forall a. a -> [a] -> [a]
: [Token]
ts
      (Token
t : [Token]
ts) -> Token
t forall a. a -> [a] -> [a]
: [Token] -> [Token]
concatTexts [Token]
ts
      [] -> []