module Text.HTML.Parser
( TagName, AttrName, AttrValue
, Token(..)
, Attr(..)
, token
, tagStream
) where
import Data.Char hiding (isSpace)
import Data.List (unfoldr)
import GHC.Generics
import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq
import Data.Attoparsec.Text
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Lazy.Builder (Builder)
import qualified Data.Text.Lazy.Builder as B
import Prelude hiding (take, takeWhile)
type TagName = Text
type AttrName = Text
type AttrValue = Text
data Token
= TagOpen !TagName [Attr]
| TagClose !TagName
| ContentChar !Char
| ContentText !Text
| Comment !Builder
| Doctype !Text
deriving (Show, Ord, Eq, Generic)
data Attr = Attr !AttrName !AttrValue
deriving (Show, Eq, Ord)
instance NFData Token where
rnf (Comment b) = rnf $ B.toLazyText b
rnf _ = ()
token :: Parser Token
token = dataState
dataState :: Parser Token
dataState = do
content <- takeWhile (/= '<')
if not $ T.null content
then return $ ContentText content
else char '<' >> tagOpen
tagOpen :: Parser Token
tagOpen =
(char '!' >> markupDeclOpen)
<|> (char '/' >> endTagOpen)
<|> (char '?' >> bogusComment)
<|> tryStartTag
<|> other
where
tryStartTag = do
c <- peekChar'
guard $ isAsciiUpper c || isAsciiLower c
tagName
other = do
return $ ContentChar '<'
endTagOpen :: Parser Token
endTagOpen = do
name <- takeWhile $ \c -> isAsciiUpper c || isAsciiLower c
char '>'
return $ TagClose name
tagName :: Parser Token
tagName = do
tag <- takeWhile $ notInClass "\x09\x0a\x0c />"
id $ (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag [])
<|> (char '/' >> selfClosingStartTag tag [])
<|> (char '>' >> return (TagOpen tag []))
selfClosingStartTag :: TagName -> [Attr] -> Parser Token
selfClosingStartTag tag attrs = do
(char '>' >> return (TagOpen tag attrs))
<|> beforeAttrName tag attrs
beforeAttrName :: TagName -> [Attr] -> Parser Token
beforeAttrName tag attrs = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '/' >> selfClosingStartTag tag attrs)
<|> (char '>' >> return (TagOpen tag attrs))
<|> attrName tag attrs
attrName :: TagName -> [Attr] -> Parser Token
attrName tag attrs = do
name <- takeWhile $ notInClass "\x09\x0a\x0c /=>\x00"
id $ (satisfy (inClass "\x09\x0a\x0c ") >> afterAttrName tag attrs name)
<|> (char '/' >> selfClosingStartTag tag attrs)
<|> (char '=' >> beforeAttrValue tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
afterAttrName :: TagName -> [Attr] -> AttrName -> Parser Token
afterAttrName tag attrs name = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '/' >> selfClosingStartTag tag attrs)
<|> (char '=' >> beforeAttrValue tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
<|> attrName tag (Attr name T.empty : attrs)
beforeAttrValue :: TagName -> [Attr] -> AttrName -> Parser Token
beforeAttrValue tag attrs name = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '"' >> attrValueDQuoted tag attrs name)
<|> (char '\'' >> attrValueSQuoted tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
<|> attrValueUnquoted tag attrs name
attrValueDQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueDQuoted tag attrs name = do
value <- takeWhile (/= '"')
char '"'
afterAttrValueQuoted tag attrs name value
attrValueSQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueSQuoted tag attrs name = do
value <- takeWhile (/= '\'')
char '\''
afterAttrValueQuoted tag attrs name value
attrValueUnquoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueUnquoted tag attrs name = do
value <- takeTill (inClass "\x09\x0a\x0c >")
id $ (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs)
<|> (char '>' >> return (TagOpen tag (Attr name value : attrs)))
afterAttrValueQuoted :: TagName -> [Attr] -> AttrName -> AttrValue -> Parser Token
afterAttrValueQuoted tag attrs name value =
(satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs')
<|> (char '/' >> selfClosingStartTag tag attrs')
<|> (char '>' >> return (TagOpen tag attrs'))
where attrs' = Attr name value : attrs
markupDeclOpen :: Parser Token
markupDeclOpen =
try comment
<|> try docType
where
comment = string "--" >> commentStart
docType = do
s <- take 7
guard $ T.toLower s == "doctype"
doctype
commentStart :: Parser Token
commentStart = do
(char '-' >> commentStartDash)
<|> (char '>' >> return (Comment mempty))
commentStartDash :: Parser Token
commentStartDash =
(char '-' >> commentEnd mempty)
<|> (char '>' >> return (Comment mempty))
<|> (do c <- anyChar
comment (B.singleton '-' <> B.singleton c) )
comment :: Builder -> Parser Token
comment content0 = do
content <- B.fromText <$> takeWhile (notInClass "-")
id $ (char '-' >> commentEndDash (content0 <> content))
<|> (char '\x00' >> comment (content0 <> content <> B.singleton '\xfffd'))
commentEndDash :: Builder -> Parser Token
commentEndDash content = do
(char '-' >> commentEnd content)
<|> (char '\x00' >> comment (content <> "-\xfffd"))
<|> (anyChar >>= \c -> comment (content <> "-" <> B.singleton c))
commentEnd :: Builder -> Parser Token
commentEnd content = do
(char '>' >> return (Comment content))
<|> (char '\x00' >> comment (content <> "-\xfffd"))
<|> (anyChar >>= \c -> comment (content <> "-" <> B.singleton c))
doctype :: Parser Token
doctype = do
content <- takeTill (=='>')
char '>'
return $ Doctype content
bogusComment :: Parser Token
bogusComment = fail "Bogus comment"
tagStream :: Text -> [Token]
tagStream = unfoldr f
where
f :: Text -> Maybe (Token, Text)
f t
| T.null t = Nothing
| otherwise =
case parse token t of
Done rest tok -> Just (tok, rest)
_ -> Nothing