module ListT.HTMLParser
(
Parser,
Error,
ErrorDetails(..),
run,
-- * Parsers
eoi,
token,
openingTag,
closingTag,
text,
comment,
html,
-- * Combinators
many1,
manyTill,
skipTill,
total,
)
where
import BasePrelude hiding (uncons, cons)
import MTLPrelude hiding (Error, shift)
import Control.Monad.Trans.Either hiding (left, right)
import ListT (ListT)
import Data.Text (Text)
import qualified Data.Text.Lazy.Builder as Text (Builder)
import qualified Data.Text.Lazy.Builder as Text.Builder
import qualified ListT as L
import qualified HTMLTokenizer.Parser as HT
import qualified ListT.HTMLParser.Renderer as Renderer
-- |
-- A backtracking HTML-tokens stream parser.
newtype Parser m a =
Parser { unwrap :: EitherT Error (StateT (ListT m HT.Token, [HT.Token]) m) a }
deriving (Functor, Applicative, MonadError Error)
-- |
-- A possibly detailed parser error.
-- When 'mzero' or 'empty' is used, an error value of 'Nothing' is produced.
type Error =
Maybe ErrorDetails
data ErrorDetails =
-- | A text message
ErrorDetails_Message Text |
-- | Unexpected token
ErrorDetails_UnexpectedToken |
-- | End of input
ErrorDetails_EOI
deriving (Show, Eq)
instance Monad m => Monad (Parser m) where
return =
Parser . return
(>>=) a b =
Parser $ unwrap a >>= unwrap . b
fail a =
throwError $ Just $ ErrorDetails_Message $ fromString a
instance Monad m => Alternative (Parser m) where
empty =
Parser $ EitherT $ return $ Left Nothing
(<|>) a b =
Parser $ EitherT $ StateT $ \(incoming, backtrack) -> do
(aResult, (incoming', backtrack')) <- flip runStateT (incoming, []) $ runEitherT $ unwrap $ a
(result'', (incoming'', backtrack'')) <-
case aResult of
Left _ -> do
flip runStateT (foldl' (flip L.cons) incoming' backtrack', []) $ runEitherT $ unwrap $ b
Right aResult -> do
return (Right aResult, (incoming', backtrack'))
return (result'', (incoming'', backtrack'' <> backtrack))
instance Monad m => MonadPlus (Parser m) where
mzero = empty
mplus = (<|>)
-- |
-- Run a parser on a stream of HTML tokens,
-- consuming only as many as needed.
run :: Monad m => Parser m a -> ListT m HT.Token -> m (Either Error a)
run p l =
flip evalStateT (l, []) $ runEitherT $ unwrap $ p
-- |
-- End of input.
eoi :: Monad m => Parser m ()
eoi =
token $> () <|> pure ()
-- |
-- Any HTML token.
token :: Monad m => Parser m HT.Token
token =
Parser $ EitherT $ StateT $ \(incoming, backtrack) ->
liftM (maybe (Left (Just ErrorDetails_EOI), (incoming, backtrack))
(\(a, incoming') -> (Right a, (incoming', a : backtrack)))) $
L.uncons incoming
-- |
-- An opening tag.
openingTag :: Monad m => Parser m HT.OpeningTag
openingTag =
token >>= \case
HT.Token_OpeningTag x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
-- |
-- A closing tag.
closingTag :: Monad m => Parser m HT.ClosingTag
closingTag =
token >>= \case
HT.Token_ClosingTag x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
-- |
-- A text between tags with HTML-entities decoded.
text :: Monad m => Parser m Text
text =
token >>= \case
HT.Token_Text x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
-- |
-- Contents of a comment.
comment :: Monad m => Parser m Text
comment =
token >>= \case
HT.Token_Comment x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
-- |
-- Apply a parser at least one time.
many1 :: Monad m => Parser m a -> Parser m [a]
many1 a =
(:) <$> a <*> many a
-- |
-- Apply a parser multiple times until another parser is satisfied.
-- Returns results of both parsers.
manyTill :: Monad m => Parser m a -> Parser m b -> Parser m ([a], b)
manyTill a b =
fix $ \loop ->
([],) <$> b <|>
(\a (al, b) -> (a : al, b)) <$> a <*> loop
-- |
-- Skip any tokens until the provided parser is satisfied.
skipTill :: Monad m => Parser m a -> Parser m a
skipTill a =
fix $ \loop ->
a <|> (token *> loop)
-- |
-- Greedily consume all the input until the end,
-- while running the provided parser.
-- Same as:
--
-- > theParser <* eoi
total :: Monad m => Parser m a -> Parser m a
total a =
a <* eoi
-- |
-- The textual HTML representation of a proper HTML tree node.
--
-- Useful for consuming HTML-formatted snippets.
html :: Monad m => Parser m Text.Builder
html =
enclosingTag <|> brokenOpenTag <|> text' <|> comment'
where
enclosingTag =
do
ot@(n, _, False) <- openingTag
theHTML <- mconcat <$> many html
ct <- closingTag
guard $ ct == n
return $ Renderer.openingTag ot <> theHTML <> Renderer.closingTag ct
brokenOpenTag =
Renderer.openingTag . repair <$> openingTag
where
repair (name, attrs, _) = (name, attrs, True)
text' =
Renderer.text <$> text
comment' =
Renderer.comment <$> comment