module Text.HTML.Parser
(
parseTokens
, parseTokensLazy
, token
, Token(..)
, TagName, AttrName, AttrValue
, Attr(..)
) where
import Data.Char hiding (isSpace)
import Data.List (unfoldr)
import GHC.Generics
import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq
import Data.Attoparsec.Text
import qualified Data.Attoparsec.Text.Lazy as AL
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Text.Lazy.Builder (Builder)
import qualified Data.Text.Lazy.Builder as B
import Prelude hiding (take, takeWhile)
type TagName = Text
type AttrName = Text
type AttrValue = Text
data Token
= TagOpen !TagName [Attr]
| TagClose !TagName
| ContentText !Text
| ContentChar !Char
| Comment !Builder
| Doctype !Text
deriving (Show, Ord, Eq, Generic)
data Attr = Attr !AttrName !AttrValue
deriving (Show, Eq, Ord)
instance NFData Token where
rnf (Comment b) = rnf $ B.toLazyText b
rnf _ = ()
token :: Parser Token
token = dataState
dataState :: Parser Token
dataState = do
content <- takeWhile (/= '<')
if not $ T.null content
then return $ ContentText content
else char '<' >> tagOpen
tagOpen :: Parser Token
tagOpen =
(char '!' >> markupDeclOpen)
<|> (char '/' >> endTagOpen)
<|> (char '?' >> bogusComment)
<|> tryStartTag
<|> other
where
tryStartTag = do
c <- peekChar'
guard $ isAsciiUpper c || isAsciiLower c
tagName
other = do
return $ ContentChar '<'
endTagOpen :: Parser Token
endTagOpen = do
name <- takeWhile $ \c -> isAsciiUpper c || isAsciiLower c
char '>'
return $ TagClose name
tagName :: Parser Token
tagName = do
tag <- takeWhile $ notInClass "\x09\x0a\x0c />"
id $ (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag [])
<|> (char '/' >> selfClosingStartTag tag [])
<|> (char '>' >> return (TagOpen tag []))
selfClosingStartTag :: TagName -> [Attr] -> Parser Token
selfClosingStartTag tag attrs = do
(char '>' >> return (TagOpen tag attrs))
<|> beforeAttrName tag attrs
beforeAttrName :: TagName -> [Attr] -> Parser Token
beforeAttrName tag attrs = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '/' >> selfClosingStartTag tag attrs)
<|> (char '>' >> return (TagOpen tag attrs))
<|> attrName tag attrs
attrName :: TagName -> [Attr] -> Parser Token
attrName tag attrs = do
name <- takeWhile $ notInClass "\x09\x0a\x0c /=>\x00"
id $ (satisfy (inClass "\x09\x0a\x0c ") >> afterAttrName tag attrs name)
<|> (char '/' >> selfClosingStartTag tag attrs)
<|> (char '=' >> beforeAttrValue tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
afterAttrName :: TagName -> [Attr] -> AttrName -> Parser Token
afterAttrName tag attrs name = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '/' >> selfClosingStartTag tag attrs)
<|> (char '=' >> beforeAttrValue tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
<|> attrName tag (Attr name T.empty : attrs)
beforeAttrValue :: TagName -> [Attr] -> AttrName -> Parser Token
beforeAttrValue tag attrs name = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '"' >> attrValueDQuoted tag attrs name)
<|> (char '\'' >> attrValueSQuoted tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
<|> attrValueUnquoted tag attrs name
attrValueDQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueDQuoted tag attrs name = do
value <- takeWhile (/= '"')
char '"'
afterAttrValueQuoted tag attrs name value
attrValueSQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueSQuoted tag attrs name = do
value <- takeWhile (/= '\'')
char '\''
afterAttrValueQuoted tag attrs name value
attrValueUnquoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueUnquoted tag attrs name = do
value <- takeTill (inClass "\x09\x0a\x0c >")
id $ (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs)
<|> (char '>' >> return (TagOpen tag (Attr name value : attrs)))
afterAttrValueQuoted :: TagName -> [Attr] -> AttrName -> AttrValue -> Parser Token
afterAttrValueQuoted tag attrs name value =
(satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs')
<|> (char '/' >> selfClosingStartTag tag attrs')
<|> (char '>' >> return (TagOpen tag attrs'))
where attrs' = Attr name value : attrs
markupDeclOpen :: Parser Token
markupDeclOpen =
try comment
<|> try docType
where
comment = string "--" >> commentStart
docType = do
s <- take 7
guard $ T.toLower s == "doctype"
doctype
commentStart :: Parser Token
commentStart = do
(char '-' >> commentStartDash)
<|> (char '>' >> return (Comment mempty))
<|> comment mempty
commentStartDash :: Parser Token
commentStartDash =
(char '-' >> commentEnd mempty)
<|> (char '>' >> return (Comment mempty))
<|> (do c <- anyChar
comment (B.singleton '-' <> B.singleton c) )
comment :: Builder -> Parser Token
comment content0 = do
content <- B.fromText <$> takeWhile (notInClass "-")
id $ (char '-' >> commentEndDash (content0 <> content))
<|> (char '\x00' >> comment (content0 <> content <> B.singleton '\xfffd'))
commentEndDash :: Builder -> Parser Token
commentEndDash content = do
(char '-' >> commentEnd content)
<|> (char '\x00' >> comment (content <> "-\xfffd"))
<|> (anyChar >>= \c -> comment (content <> "-" <> B.singleton c))
commentEnd :: Builder -> Parser Token
commentEnd content = do
(char '>' >> return (Comment content))
<|> (char '\x00' >> comment (content <> "-\xfffd"))
<|> (anyChar >>= \c -> comment (content <> "-" <> B.singleton c))
doctype :: Parser Token
doctype = do
content <- takeTill (=='>')
char '>'
return $ Doctype content
bogusComment :: Parser Token
bogusComment = fail "Bogus comment"
parseTokens :: Text -> [Token]
parseTokens = unfoldr f
where
f :: Text -> Maybe (Token, Text)
f t
| T.null t = Nothing
| otherwise =
case parse token t of
Done rest tok -> Just (tok, rest)
Partial cont ->
case cont mempty of
Done rest tok -> Just (tok, rest)
_ -> Nothing
_ -> Nothing
parseTokensLazy :: TL.Text -> [Token]
parseTokensLazy = unfoldr f
where
f :: TL.Text -> Maybe (Token, TL.Text)
f t
| TL.null t = Nothing
| otherwise =
case AL.parse token t of
AL.Done rest tok -> Just (tok, rest)
_ -> Nothing