module ListT.HTMLParser ( Parser, Error, ErrorDetails(..), run, -- * Parsers eoi, token, openingTag, closingTag, text, comment, html, -- * Combinators many1, manyTill, skipTill, total, ) where import BasePrelude hiding (uncons, cons) import MTLPrelude hiding (Error, shift) import Control.Monad.Trans.Either hiding (left, right) import ListT (ListT) import Data.Text (Text) import qualified Data.Text.Lazy.Builder as Text (Builder) import qualified Data.Text.Lazy.Builder as Text.Builder import qualified ListT as L import qualified HTMLTokenizer.Parser as HT import qualified ListT.HTMLParser.Renderer as Renderer -- | -- A backtracking HTML-tokens stream parser. newtype Parser m a = Parser { unwrap :: EitherT Error (StateT (ListT m HT.Token, [HT.Token]) m) a } deriving (Functor, Applicative, MonadError Error) -- | -- A possibly detailed parser error. -- When 'mzero' or 'empty' is used, an error value of 'Nothing' is produced. type Error = Maybe ErrorDetails data ErrorDetails = -- | A text message ErrorDetails_Message Text | -- | Unexpected token ErrorDetails_UnexpectedToken | -- | End of input ErrorDetails_EOI deriving (Show, Eq) instance Monad m => Monad (Parser m) where return = Parser . return (>>=) a b = Parser $ unwrap a >>= unwrap . b fail a = throwError $ Just $ ErrorDetails_Message $ fromString a instance Monad m => Alternative (Parser m) where empty = Parser $ EitherT $ return $ Left Nothing (<|>) a b = Parser $ EitherT $ StateT $ \(incoming, backtrack) -> do (aResult, (incoming', backtrack')) <- flip runStateT (incoming, []) $ runEitherT $ unwrap $ a (result'', (incoming'', backtrack'')) <- case aResult of Left _ -> do flip runStateT (foldl' (flip L.cons) incoming' backtrack', []) $ runEitherT $ unwrap $ b Right aResult -> do return (Right aResult, (incoming', backtrack')) return (result'', (incoming'', backtrack'' <> backtrack)) instance Monad m => MonadPlus (Parser m) where mzero = empty mplus = (<|>) -- | -- Run a parser on a stream of HTML tokens, -- consuming only as many as needed. run :: Monad m => Parser m a -> ListT m HT.Token -> m (Either Error a) run p l = flip evalStateT (l, []) $ runEitherT $ unwrap $ p -- | -- End of input. eoi :: Monad m => Parser m () eoi = token $> () <|> pure () -- | -- Any HTML token. token :: Monad m => Parser m HT.Token token = Parser $ EitherT $ StateT $ \(incoming, backtrack) -> liftM (maybe (Left (Just ErrorDetails_EOI), (incoming, backtrack)) (\(a, incoming') -> (Right a, (incoming', a : backtrack)))) $ L.uncons incoming -- | -- An opening tag. openingTag :: Monad m => Parser m HT.OpeningTag openingTag = token >>= \case HT.Token_OpeningTag x -> return x _ -> throwError (Just ErrorDetails_UnexpectedToken) -- | -- A closing tag. closingTag :: Monad m => Parser m HT.ClosingTag closingTag = token >>= \case HT.Token_ClosingTag x -> return x _ -> throwError (Just ErrorDetails_UnexpectedToken) -- | -- A text between tags with HTML-entities decoded. text :: Monad m => Parser m Text text = token >>= \case HT.Token_Text x -> return x _ -> throwError (Just ErrorDetails_UnexpectedToken) -- | -- Contents of a comment. comment :: Monad m => Parser m Text comment = token >>= \case HT.Token_Comment x -> return x _ -> throwError (Just ErrorDetails_UnexpectedToken) -- | -- Apply a parser at least one time. many1 :: Monad m => Parser m a -> Parser m [a] many1 a = (:) <$> a <*> many a -- | -- Apply a parser multiple times until another parser is satisfied. -- Returns results of both parsers. manyTill :: Monad m => Parser m a -> Parser m b -> Parser m ([a], b) manyTill a b = fix $ \loop -> ([],) <$> b <|> (\a (al, b) -> (a : al, b)) <$> a <*> loop -- | -- Skip any tokens until the provided parser is satisfied. skipTill :: Monad m => Parser m a -> Parser m a skipTill a = fix $ \loop -> a <|> (token *> loop) -- | -- Greedily consume all the input until the end, -- while running the provided parser. -- Same as: -- -- > theParser <* eoi total :: Monad m => Parser m a -> Parser m a total a = a <* eoi -- | -- The textual HTML representation of a proper HTML tree node. -- -- Useful for consuming HTML-formatted snippets. html :: Monad m => Parser m Text.Builder html = enclosingTag <|> brokenOpenTag <|> text' <|> comment' where enclosingTag = do ot@(n, _, False) <- openingTag theHTML <- mconcat <$> many html ct <- closingTag guard $ ct == n return $ Renderer.openingTag ot <> theHTML <> Renderer.closingTag ct brokenOpenTag = Renderer.openingTag . repair <$> openingTag where repair (name, attrs, _) = (name, attrs, True) text' = Renderer.text <$> text comment' = Renderer.comment <$> comment