{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE LambdaCase #-}
{-# OPTIONS_GHC -O2 #-}
module Text.HTML.Parser
(
parseTokens
, parseTokensLazy
, token
, Token(..)
, TagName, AttrName, AttrValue
, Attr(..)
, renderTokens
, renderToken
, renderAttrs
, renderAttr
, canonicalizeTokens
) where
import Data.Char hiding (isSpace)
import Data.List (unfoldr)
import GHC.Generics
import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq
import Data.Attoparsec.Text
import qualified Data.Attoparsec.Text.Lazy as AL
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Text.Lazy.Builder (Builder)
import qualified Data.Text.Lazy.Builder as B
import Prelude hiding (take, takeWhile)
type TagName = Text
type AttrName = Text
type AttrValue = Text
data Token
= TagOpen !TagName [Attr]
| TagSelfClose !TagName [Attr]
| TagClose !TagName
| ContentText !Text
| ContentChar !Char
| Comment !Builder
| Doctype !Text
deriving (Show, Ord, Eq, Generic)
endOfFileToken :: Token
endOfFileToken = ContentText ""
data Attr = Attr !AttrName !AttrValue
deriving (Show, Eq, Ord)
instance NFData Token where
rnf (Comment b) = rnf $ B.toLazyText b
rnf _ = ()
token :: Parser Token
token = dataState
dataState :: Parser Token
dataState = do
content <- takeWhile (/= '<')
if not $ T.null content
then return $ ContentText content
else char '<' >> tagOpen
tagOpen :: Parser Token
tagOpen =
(char '!' >> markupDeclOpen)
<|> (char '/' >> endTagOpen)
<|> (char '?' >> bogusComment mempty)
<|> tagNameOpen
<|> other
where
other = do
return $ ContentChar '<'
endTagOpen :: Parser Token
endTagOpen = tagNameClose
isWhitespace :: Char -> Bool
isWhitespace '\x09' = True
isWhitespace '\x0a' = True
isWhitespace '\x0c' = True
isWhitespace ' ' = True
isWhitespace _ = False
orC :: (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
orC f g c = f c || g c
{-# INLINE orC #-}
isC :: Char -> Char -> Bool
isC = (==)
{-# INLINE isC #-}
tagNameOpen :: Parser Token
tagNameOpen = do
tag <- tagName'
id $ (satisfy isWhitespace >> beforeAttrName tag [])
<|> (char '/' >> selfClosingStartTag tag [])
<|> (char '>' >> return (TagOpen tag []))
tagNameClose :: Parser Token
tagNameClose = do
tag <- tagName'
char '>' >> return (TagClose tag)
tagName' :: Parser Text
tagName' = do
c <- peekChar'
guard $ isAsciiUpper c || isAsciiLower c
takeWhile $ not . (isWhitespace `orC` isC '/' `orC` isC '<' `orC` isC '>')
selfClosingStartTag :: TagName -> [Attr] -> Parser Token
selfClosingStartTag tag attrs = do
(char '>' >> return (TagSelfClose tag attrs))
<|> (endOfInput >> return endOfFileToken)
<|> beforeAttrName tag attrs
beforeAttrName :: TagName -> [Attr] -> Parser Token
beforeAttrName tag attrs = do
skipWhile isWhitespace
id $ (char '/' >> selfClosingStartTag tag attrs)
<|> (char '>' >> return (TagOpen tag attrs))
<|> attrName tag attrs
attrName :: TagName -> [Attr] -> Parser Token
attrName tag attrs = do
name <- takeWhile $ not . (isWhitespace `orC` isC '/' `orC` isC '=' `orC` isC '>')
id $ (endOfInput >> afterAttrName tag attrs name)
<|> (char '=' >> beforeAttrValue tag attrs name)
<|> try (do mc <- peekChar
case mc of
Just c | notNameChar c -> afterAttrName tag attrs name
_ -> empty)
where notNameChar = isWhitespace `orC` isC '/' `orC` isC '>'
afterAttrName :: TagName -> [Attr] -> AttrName -> Parser Token
afterAttrName tag attrs name = do
skipWhile isWhitespace
id $ (char '/' >> selfClosingStartTag tag attrs)
<|> (char '=' >> beforeAttrValue tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
<|> (endOfInput >> return endOfFileToken)
<|> attrName tag (Attr name T.empty : attrs)
beforeAttrValue :: TagName -> [Attr] -> AttrName -> Parser Token
beforeAttrValue tag attrs name = do
skipWhile isWhitespace
id $ (char '"' >> attrValueDQuoted tag attrs name)
<|> (char '\'' >> attrValueSQuoted tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
<|> attrValueUnquoted tag attrs name
attrValueDQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueDQuoted tag attrs name = do
value <- takeWhile (/= '"')
_ <- char '"'
afterAttrValueQuoted tag attrs name value
attrValueSQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueSQuoted tag attrs name = do
value <- takeWhile (/= '\'')
_ <- char '\''
afterAttrValueQuoted tag attrs name value
attrValueUnquoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueUnquoted tag attrs name = do
value <- takeTill $ isWhitespace `orC` isC '>'
id $ (satisfy isWhitespace >> beforeAttrName tag attrs)
<|> (char '>' >> return (TagOpen tag (Attr name value : attrs)))
<|> (endOfInput >> return endOfFileToken)
afterAttrValueQuoted :: TagName -> [Attr] -> AttrName -> AttrValue -> Parser Token
afterAttrValueQuoted tag attrs name value =
(satisfy isWhitespace >> beforeAttrName tag attrs')
<|> (char '/' >> selfClosingStartTag tag attrs')
<|> (char '>' >> return (TagOpen tag attrs'))
<|> (endOfInput >> return endOfFileToken)
where attrs' = Attr name value : attrs
bogusComment :: Builder -> Parser Token
bogusComment content = do
(char '>' >> return (Comment content))
<|> (endOfInput >> return (Comment content))
<|> (char '\x00' >> bogusComment (content <> "\xfffd"))
<|> (anyChar >>= \c -> bogusComment (content <> B.singleton c))
markupDeclOpen :: Parser Token
markupDeclOpen =
try comment_
<|> try docType
<|> bogusComment mempty
where
comment_ = char '-' >> char '-' >> commentStart
docType = do
s <- take 7
guard $ T.toLower s == "doctype"
doctype
commentStart :: Parser Token
commentStart = do
(char '-' >> commentStartDash)
<|> (char '>' >> return (Comment mempty))
<|> comment mempty
commentStartDash :: Parser Token
commentStartDash =
(char '-' >> commentEnd mempty)
<|> (char '>' >> return (Comment mempty))
<|> (endOfInput >> return (Comment mempty))
<|> (comment (B.singleton '-'))
comment :: Builder -> Parser Token
comment content0 = do
content <- B.fromText <$> takeWhile (not . (isC '-' `orC` isC '\x00' `orC` isC '<'))
id $ (char '<' >> commentLessThan (content0 <> content <> "<"))
<|> (char '-' >> commentEndDash (content0 <> content))
<|> (char '\x00' >> comment (content0 <> content <> B.singleton '\xfffd'))
<|> (endOfInput >> return (Comment $ content0 <> content))
commentLessThan :: Builder -> Parser Token
commentLessThan content =
(char '!' >> commentLessThanBang (content <> "!"))
<|> (char '<' >> commentLessThan (content <> "<"))
<|> comment content
commentLessThanBang :: Builder -> Parser Token
commentLessThanBang content =
(char '-' >> commentLessThanBangDash content)
<|> comment content
commentLessThanBangDash :: Builder -> Parser Token
commentLessThanBangDash content =
(char '-' >> commentLessThanBangDashDash content)
<|> commentEndDash content
commentLessThanBangDashDash :: Builder -> Parser Token
commentLessThanBangDashDash content =
(char '>' >> comment content)
<|> (endOfInput >> comment content)
<|> commentEnd content
commentEndDash :: Builder -> Parser Token
commentEndDash content = do
(char '-' >> commentEnd content)
<|> (endOfInput >> return (Comment content))
<|> (comment (content <> "-"))
commentEnd :: Builder -> Parser Token
commentEnd content = do
(char '>' >> return (Comment content))
<|> (char '!' >> commentEndBang content)
<|> (char '-' >> commentEnd (content <> "-"))
<|> (endOfInput >> return (Comment content))
<|> (comment (content <> "--"))
commentEndBang :: Builder -> Parser Token
commentEndBang content = do
(char '-' >> commentEndDash (content <> "--!"))
<|> (char '>' >> return (Comment content))
<|> (endOfInput >> return (Comment content))
<|> (comment (content <> "--!"))
doctype :: Parser Token
doctype = do
content <- takeTill (=='>')
_ <- char '>'
return $ Doctype content
parseTokens :: Text -> [Token]
parseTokens = unfoldr f
where
f :: Text -> Maybe (Token, Text)
f t
| T.null t = Nothing
| otherwise =
case parse token t of
Done rest tok -> Just (tok, rest)
Partial cont ->
case cont mempty of
Done rest tok -> Just (tok, rest)
_ -> Nothing
_ -> Nothing
parseTokensLazy :: TL.Text -> [Token]
parseTokensLazy = unfoldr f
where
f :: TL.Text -> Maybe (Token, TL.Text)
f t
| TL.null t = Nothing
| otherwise =
case AL.parse token t of
AL.Done rest tok -> Just (tok, rest)
_ -> Nothing
renderTokens :: [Token] -> TL.Text
renderTokens = mconcat . fmap renderToken
renderToken :: Token -> TL.Text
renderToken = TL.fromStrict . mconcat . \case
(TagOpen n []) -> ["<", n, ">"]
(TagOpen n attrs) -> ["<", n, " ", renderAttrs attrs, ">"]
(TagSelfClose n attrs) -> ["<", n, " ", renderAttrs attrs, " />"]
(TagClose n) -> ["</", n, ">"]
(ContentChar c) -> [T.singleton c]
(ContentText t) -> [t]
(Comment builder) -> ["<!--", TL.toStrict $ B.toLazyText builder, "-->"]
(Doctype t) -> ["<!DOCTYPE", t, ">"]
renderAttrs :: [Attr] -> Text
renderAttrs = T.unwords . fmap renderAttr . reverse
renderAttr :: Attr -> Text
renderAttr (Attr k v) = mconcat [k, "=\"", v, "\""]
canonicalizeTokens :: [Token] -> [Token]
canonicalizeTokens = filter (/= ContentText "") . meldTextTokens
meldTextTokens :: [Token] -> [Token]
meldTextTokens = concatTexts . fmap charToText
where
charToText (ContentChar c) = ContentText (T.singleton c)
charToText t = t
concatTexts = \case
(ContentText t : ContentText t' : ts) -> concatTexts $ ContentText (t <> t') : ts
(t : ts) -> t : concatTexts ts
[] -> []