{-# LANGUAGE OverloadedStrings, TupleSections, ViewPatterns #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RecordWildCards #-}
module Text.HTML.TagStream
( Token (..)
, tokenStream
) where
import Control.Applicative
import Control.Monad (unless)
import Control.Monad.Trans.Resource (MonadThrow)
import Data.Char
import qualified Data.Conduit.List as CL
import Data.Attoparsec.Text
import Data.Conduit
import qualified Data.Conduit.Attoparsec as CA
import Data.Maybe (fromMaybe, isJust)
import Data.Monoid ((<>))
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
import qualified Text.XML.Stream.Parse as XML
import Data.Map (Map)
import qualified Data.Map.Strict as Map
import Control.Arrow (first)
data Token
= TagOpen Text (Map Text Text) Bool
| TagClose Text
| Text Text
| Comment Text
| Special Text Text
| Incomplete Text
deriving (Eq, Show)
data TagType
= TagTypeClose
| TagTypeSpecial
| TagTypeNormal
quoted :: Char -> Parser Text
quoted q = T.append <$> takeTill (in2 ('\\',q))
<*> ( char q *> pure ""
<|> char '\\' *> atLeast 1 (quoted q) )
quotedOr :: Parser Text -> Parser Text
quotedOr p = maybeP (satisfy (in2 ('"','\''))) >>=
maybe p quoted
attrValue :: Parser Text
attrValue = quotedOr $ takeTill ((=='>') ||. isSpace)
attrName :: Parser Text
attrName = quotedOr $
T.cons <$> satisfy (/='>')
<*> takeTill (in3 ('/','>','=') ||. isSpace)
tagEnd :: Parser Bool
tagEnd = char '>' *> pure False
<|> string "/>" *> pure True
attr :: Parser (Text, Text)
attr = (,) <$> attrName <* skipSpace
<*> ( boolP (char '=') >>=
cond (skipSpace *> attrValue)
(pure "")
)
attrs :: Parser (Map Text Text, Bool)
attrs = loop Map.empty
where
loop acc = skipSpace *> (Left <$> tagEnd <|> Right <$> attr) >>=
either
(return . (acc,))
(\(key, value) -> loop $ Map.insert key value acc)
comment :: Parser Token
comment = Comment <$> comment'
where comment' = T.append <$> takeTill (=='-')
<*> ( string "-->" *> return ""
<|> atLeast 1 comment' )
special :: Parser Token
special = Special
<$> ( T.cons <$> satisfy (not . ((=='-') ||. isSpace))
<*> takeTill ((=='>') ||. isSpace)
<* skipSpace )
<*> takeTill (=='>') <* char '>'
tag :: Parser Token
tag = do
t <- string "/" *> return TagTypeClose
<|> string "!" *> return TagTypeSpecial
<|> return TagTypeNormal
case t of
TagTypeClose ->
TagClose <$> takeTill (=='>')
<* char '>'
TagTypeSpecial -> boolP (string "--") >>=
cond comment special
TagTypeNormal -> do
name <- takeTill (in3 ('<','>','/') ||. isSpace)
(as, close) <- attrs
return $ TagOpen name (Map.map decodeString as) close
incomplete :: Parser Token
incomplete = Incomplete . T.cons '<' <$> takeText
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))
decodeEntity :: MonadThrow m => Text -> m Text
decodeEntity entity =
runConduit
$ CL.sourceList ["&",entity,";"]
.| XML.parseText' XML.def { XML.psDecodeEntities = XML.decodeHtmlEntities }
.| XML.content
token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
tillScriptEnd :: Token -> Parser [Token]
tillScriptEnd open =
loop mempty
where
loop acc = do
chunk <- takeTill (== '<')
let acc' = acc <> B.fromText chunk
finish = pure [open, Text $ L.toStrict $ B.toLazyText acc', TagClose "script"]
hasContent = (string "/script>" *> finish) <|> loop acc'
(char '<' *> hasContent) <|> finish
tokens :: Parser [Token]
tokens = do
t <- token
case t of
TagOpen "script" _ False -> tillScriptEnd t
Text text0 -> do
let parseText = do
Text text <- token
pure text
texts <- many parseText
pure [Text $ decodeString $ T.concat $ text0 : texts]
_ -> pure [t]
atLeast :: Int -> Parser Text -> Parser Text
atLeast 0 p = p
atLeast n p = T.cons <$> anyChar <*> atLeast (n-1) p
cond :: a -> a -> Bool -> a
cond a1 a2 b = if b then a1 else a2
(||.) :: Applicative f => f Bool -> f Bool -> f Bool
(||.) = liftA2 (||)
in2 :: Eq a => (a,a) -> a -> Bool
in2 (a1,a2) a = a==a1 || a==a2
in3 :: Eq a => (a,a,a) -> a -> Bool
in3 (a1,a2,a3) a = a==a1 || a==a2 || a==a3
boolP :: Parser a -> Parser Bool
boolP p = p *> pure True <|> pure False
maybeP :: Parser a -> Parser (Maybe a)
maybeP p = Just <$> p <|> return Nothing
tokenStream :: Monad m
=> ConduitT Text Token m ()
tokenStream =
CL.filter (not . T.null) .| CA.conduitParserEither tokens .| CL.concatMap go
where
go (Left e) = error $ "html-conduit: parse error that should never happen occurred! " ++ show e
go (Right (_, tokens')) = tokens'
splitAccum :: [Token] -> (Text, [Token])
splitAccum [] = (mempty, [])
splitAccum (reverse -> (Incomplete s : xs)) = (s, reverse xs)
splitAccum tokens = (mempty, tokens)
decodeEntities :: Monad m => ConduitT Token Token m ()
decodeEntities =
start
where
start = await >>= maybe (return ()) (\token' -> start' token' >> start)
start' (Text t) = (yield t >> yieldWhileText) .| decodeEntities' .| CL.mapMaybe go
start' (TagOpen name attrs' bool) = yield (TagOpen name (Map.map decodeString attrs') bool)
start' token' = yield token'
go t
| t == "" = Nothing
| otherwise = Just (Text t)
decodeString :: Text -> Text
decodeString input =
case makeEntityDecoder input of
(value', remainder)
| value' /= mempty -> value' <> decodeString remainder
| otherwise -> input
decodeEntities' :: Monad m => ConduitT Text Text m ()
decodeEntities' =
loop id
where
loop accum = do
mchunk <- await
let chunk = accum $ fromMaybe mempty mchunk
(newStr, remainder) = makeEntityDecoder chunk
yield newStr
if isJust mchunk
then loop (mappend remainder)
else yield remainder
yieldWhileText :: Monad m => ConduitT Token Text m ()
yieldWhileText =
loop
where
loop = await >>= maybe (return ()) go
go (Text t) = yield t >> loop
go token' = leftover token'
makeEntityDecoder :: Text -> (Text, Text)
makeEntityDecoder = first (L.toStrict . B.toLazyText) . go
where
go s =
case T.break (=='&') s of
(_,"") -> (B.fromText s, "")
(before,restPlusAmp@(T.drop 1 -> rest)) ->
case T.break (not . (\c -> isNameChar c || c == '#')) rest of
(_,"") -> (B.fromText before, restPlusAmp)
(entity,after) -> (before1 <> before2, after')
where
before1 = B.fromText before
(before2, after') =
case mdecoded of
Nothing -> first (("&" <> B.fromText entity) <>) (go after)
Just (B.fromText -> decoded) ->
case T.uncons after of
Just (';',validAfter) -> first (decoded <>) (go validAfter)
Just (_invalid,_rest) -> first (decoded <>) (go after)
Nothing -> (mempty, s)
mdecoded =
if entity == mempty
then Nothing
else decodeEntity entity
isNameStart :: Char -> Bool
isNameStart c =
c == ':' ||
c == '_' ||
isAsciiUpper c ||
isAsciiLower c ||
(c >= '\xC0' && c <= '\xD6') ||
(c >= '\xD8' && c <= '\xF6') ||
(c >= '\xF8' && c <= '\x2FF') ||
(c >= '\x370' && c <= '\x37D') ||
(c >= '\x37F' && c <= '\x1FFF') ||
(c >= '\x200C' && c <= '\x200D') ||
(c >= '\x2070' && c <= '\x218F') ||
(c >= '\x2C00' && c <= '\x2FEF') ||
(c >= '\x3001' && c <= '\xD7FF') ||
(c >= '\xF900' && c <= '\xFDCF') ||
(c >= '\xFDF0' && c <= '\xFFFD') ||
(c >= '\x10000' && c <= '\xEFFFF')
isNameChar :: Char -> Bool
isNameChar c =
c == '-' ||
c == '.' ||
c == '\xB7' ||
isDigit c ||
isNameStart c ||
(c >= '\x0300' && c <= '\x036F') ||
(c >= '\x203F' && c <= '\x2040')