Text/HTML/Parser.hs

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}

-- | This is a performance-oriented HTML tokenizer aim at web-crawling
-- applications. It follows the HTML5 parsing specification quite closely,
-- so it behaves reasonable well on ill-formed documents from the open Web.
--
module Text.HTML.Parser
    ( -- * Parsing
      parseTokens
    , parseTokensLazy
    , token
      -- * Types
    , Token(..)
    , TagName, AttrName, AttrValue
    , Attr(..)
    ) where

import Data.Char hiding (isSpace)
import Data.List (unfoldr)
import GHC.Generics
import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq

import Data.Attoparsec.Text
import qualified Data.Attoparsec.Text.Lazy as AL
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Text.Lazy.Builder (Builder)
import qualified Data.Text.Lazy.Builder as B
import Prelude hiding (take, takeWhile)

-- | A tag name (e.g. @body@)
type TagName   = Text

-- | An attribute name (e.g. @href@)
type AttrName  = Text

-- | The value of an attribute
type AttrValue = Text

-- | An HTML token
data Token
  -- | An opening tag. Attribute ordering is arbitrary.
  = TagOpen !TagName [Attr]
  -- | A closing tag.
  | TagClose !TagName
  -- | The content between tags.
  | ContentText !Text
  -- | A single character of content
  | ContentChar !Char
  -- | Contents of a comment.
  | Comment !Builder
  -- | Doctype
  | Doctype !Text
  deriving (Show, Ord, Eq, Generic)

-- | An attribute of a tag
data Attr = Attr !AttrName !AttrValue
          deriving (Show, Eq, Ord)

instance NFData Token where
    rnf (Comment b) = rnf $ B.toLazyText b
    rnf _           = ()

-- | Parse a single 'Token'.
token :: Parser Token
token = dataState -- Start in the data state.

-- | /§8.2.4.1/: Data state
dataState :: Parser Token
dataState = do
    content <- takeWhile (/= '<')
    if not $ T.null content
      then return $ ContentText content
      else char '<' >> tagOpen

-- | /§8.2.4.3/: Tag open state
tagOpen :: Parser Token
tagOpen =
        (char '!' >> markupDeclOpen)
    <|> (char '/' >> endTagOpen)
    <|> (char '?' >> bogusComment)
    <|> tryStartTag
    <|> other
  where
    tryStartTag = do
        c <- peekChar'
        guard $ isAsciiUpper c || isAsciiLower c
        tagName

    other = do
        return $ ContentChar '<'

-- | /§8.2.4.9/: End tag open state
-- TODO: This isn't right
endTagOpen :: Parser Token
endTagOpen = do
    name <- takeWhile $ \c -> isAsciiUpper c || isAsciiLower c
    char '>'
    return $ TagClose name

-- | /§8.2.4.10/: Tag name state
--
-- deviation: no lower-casing
tagName :: Parser Token
tagName = do
    tag <- takeWhile $ notInClass "\x09\x0a\x0c />"
    id $  (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag [])
      <|> (char '/' >> selfClosingStartTag tag [])
      <|> (char '>' >> return (TagOpen tag []))

-- | /§8.2.4.43/: Self-closing start tag state
selfClosingStartTag :: TagName -> [Attr] -> Parser Token
selfClosingStartTag tag attrs = do
        (char '>' >> return (TagOpen tag attrs))
    <|> beforeAttrName tag attrs

-- | /§8.2.4.34/: Before attribute name state
--
-- deviation: no lower-casing
beforeAttrName :: TagName -> [Attr] -> Parser Token
beforeAttrName tag attrs = do
    skipWhile $ inClass "\x09\x0a\x0c "
    id $  (char '/' >> selfClosingStartTag tag attrs)
      <|> (char '>' >> return (TagOpen tag attrs))
      -- <|> (char '\x00' >> attrName tag attrs) -- TODO: NULL
      <|> attrName tag attrs

-- | /§8.2.4.35/: Attribute name state
attrName :: TagName -> [Attr] -> Parser Token
attrName tag attrs = do
    name <- takeWhile $ notInClass "\x09\x0a\x0c /=>\x00"
    id $  (satisfy (inClass "\x09\x0a\x0c ") >> afterAttrName tag attrs name)
      <|> (char '/' >> selfClosingStartTag tag attrs)
      <|> (char '=' >> beforeAttrValue tag attrs name)
      <|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
      -- <|> -- TODO: NULL

-- | /§8.2.4.36/: After attribute name state
afterAttrName :: TagName -> [Attr] -> AttrName -> Parser Token
afterAttrName tag attrs name = do
    skipWhile $ inClass "\x09\x0a\x0c "
    id $  (char '/' >> selfClosingStartTag tag attrs)
      <|> (char '=' >> beforeAttrValue tag attrs name)
      <|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
      <|> attrName tag (Attr name T.empty : attrs)  -- not exactly sure this is right

-- | /§8.2.4.37/: Before attribute value state
beforeAttrValue :: TagName -> [Attr] -> AttrName -> Parser Token
beforeAttrValue tag attrs name = do
    skipWhile $ inClass "\x09\x0a\x0c "
    id $  (char '"' >> attrValueDQuoted tag attrs name)
      <|> (char '\'' >> attrValueSQuoted tag attrs name)
      <|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
      <|> attrValueUnquoted tag attrs name

-- | /§8.2.4.38/: Attribute value (double-quoted) state
attrValueDQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueDQuoted tag attrs name = do
    value <- takeWhile (/= '"')
    char '"'
    afterAttrValueQuoted tag attrs name value

-- | /§8.2.4.39/: Attribute value (single-quoted) state
attrValueSQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueSQuoted tag attrs name = do
    value <- takeWhile (/= '\'')
    char '\''
    afterAttrValueQuoted tag attrs name value

-- | /§8.2.4.40/: Attribute value (unquoted) state
attrValueUnquoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueUnquoted tag attrs name = do
    value <- takeTill (inClass "\x09\x0a\x0c >")
    id $  (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs) -- unsure: don't emit?
      <|> (char '>' >> return (TagOpen tag (Attr name value : attrs)))

-- | /§8.2.4.42/: After attribute value (quoted) state
afterAttrValueQuoted :: TagName -> [Attr] -> AttrName -> AttrValue -> Parser Token
afterAttrValueQuoted tag attrs name value =
          (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs')
      <|> (char '/' >> selfClosingStartTag tag attrs')
      <|> (char '>' >> return (TagOpen tag attrs'))
  where attrs' = Attr name value : attrs

-- | /§8.2.4.45/: Markup declaration open state
markupDeclOpen :: Parser Token
markupDeclOpen =
        try comment
    <|> try docType
        -- TODO: Fix the rest
  where
    comment = string "--" >> commentStart
    docType = do
        -- switching this to asciiCI slowed things down by a factor of two
        s <- take 7
        guard $ T.toLower s == "doctype"
        doctype

-- | /§8.2.4.46/: Comment start state
commentStart :: Parser Token
commentStart = do
          (char '-' >> commentStartDash)
      <|> (char '>' >> return (Comment mempty))
      <|> comment mempty

-- | /§8.2.4.47/: Comment start dash state
commentStartDash :: Parser Token
commentStartDash =
          (char '-' >> commentEnd mempty)
      <|> (char '>' >> return (Comment mempty))
      <|> (do c <- anyChar
              comment (B.singleton '-' <> B.singleton c) )

-- | /§8.2.4.48/: Comment state
comment :: Builder -> Parser Token
comment content0 = do
    content <- B.fromText <$> takeWhile (notInClass "-")
    id $  (char '-' >> commentEndDash (content0 <> content))
      <|> (char '\x00' >> comment (content0 <> content <> B.singleton '\xfffd'))

-- | /§8.2.4.49/: Comment end dash state
commentEndDash :: Builder -> Parser Token
commentEndDash content = do
        (char '-' >> commentEnd content)
    <|> (char '\x00' >> comment (content <> "-\xfffd"))
    <|> (anyChar >>= \c -> comment (content <> "-" <> B.singleton c))

-- | /§8.2.4.50/: Comment end state
commentEnd :: Builder -> Parser Token
commentEnd content = do
        (char '>' >> return (Comment content))
    <|> (char '\x00' >> comment (content <> "-\xfffd"))
    -- <|> ()  TODO: other cases
    <|> (anyChar >>= \c -> comment (content <> "-" <> B.singleton c))

-- | /§8.2.4.52/: DOCTYPE state
-- FIXME
doctype :: Parser Token
doctype = do
    content <- takeTill (=='>')
    char '>'
    return $ Doctype content

-- | /§8.2.4.44/: Bogus comment state
bogusComment :: Parser Token
bogusComment = fail "Bogus comment"

-- | Parse a lazy list of tokens from strict 'Text'.
parseTokens :: Text -> [Token]
parseTokens = unfoldr f
  where
    f :: Text -> Maybe (Token, Text)
    f t
      | T.null t = Nothing
      | otherwise =
        case parse token t of
            Done rest tok -> Just (tok, rest)
            Partial cont  ->
                case cont mempty of
                  Done rest tok -> Just (tok, rest)
                  _             -> Nothing
            _             -> Nothing

-- | Parse a lazy list of tokens from lazy 'TL.Text'.
parseTokensLazy :: TL.Text -> [Token]
parseTokensLazy = unfoldr f
  where
    f :: TL.Text -> Maybe (Token, TL.Text)
    f t
      | TL.null t = Nothing
      | otherwise =
        case AL.parse token t of
            AL.Done rest tok -> Just (tok, rest)
            _                -> Nothing