{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeFamilies #-}
module Data.XML.Parser.High
( module Data.XML.Parser.High.AttrParser
, module Data.XML.Parser.High.NameParser
, Prolog(..)
, Token(..)
, TokenParser()
, ContentParser()
, noContent
, withContent
, anyContent
, runTokenParser
, prolog
, instruction
, textContent
, textContent'
, tag
, tag'
, anyTag
, anyToken
, anyToken'
) where
import Control.Applicative
import Control.Arrow
import Control.Monad.Compat
import Control.Monad.Fail.Compat
import Data.Function
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Maybe
import Data.String
import Data.Text (Text)
import qualified Data.Text as Text
import Data.XML.Parser.High.AttrParser
import Data.XML.Parser.High.NameParser
import Data.XML.Parser.Low
import qualified Data.XML.Parser.Mid as L1
import Data.XML.Parser.Mid.Attribute
import Prelude ()
import Prelude.Compat
import Text.Parser.Char
import Text.Parser.Combinators
import Text.ParserCombinators.ReadP (readP_to_S)
data Prolog = Prolog
{ prologXmlDeclaration :: Maybe L1.XMLDeclaration
, prologInstructions :: [L1.Instruction]
, prologDoctype :: Maybe L1.Doctype
} deriving (Eq, Ord, Read, Show)
data Token
= TokenProlog Prolog
| TokenInstruction L1.Instruction
| TokenTag QName (Map QName Text) [Token]
| TokenTextContent Text
deriving (Eq, Ord, Read, Show)
newtype TokenParser m a = TokenParser { runTokenParser :: m a }
deriving instance Functor m => Functor (TokenParser m)
deriving instance Applicative m => Applicative (TokenParser m)
deriving instance Alternative m => Alternative (TokenParser m)
deriving instance Monad m => Monad (TokenParser m)
instance (Parsing m, Monad m) => MonadFail (TokenParser m) where
fail = TokenParser . unexpected
data ContentParser m a
= NoContent (m a)
| AnyContent ([Token] -> m a)
| WithContent (TokenParser m a)
deriving instance Functor m => Functor (ContentParser m)
withContent :: TokenParser m a -> ContentParser m a
withContent = WithContent
noContent :: Applicative m => ContentParser m ()
noContent = NoContent $ pure ()
anyContent :: CharParsing m => Monad m => ContentParser m ()
anyContent = AnyContent $ const $ pure ()
instruction :: CharParsing m => Monad m => TokenParser m L1.Instruction
instruction = TokenParser $ do
skipCommentsWhitespace
L1.runTokenParser L1.tokenInstruction
prolog :: CharParsing m => Monad m => TokenParser m Prolog
prolog = TokenParser $ do
xmlDeclaration <- optional $ L1.runTokenParser L1.tokenXmlDeclaration
skipCommentsWhitespace
instructions <- runTokenParser $ many instruction
doctype <- optional $ do
skipCommentsWhitespace
L1.runTokenParser L1.tokenDoctype
when (isNothing xmlDeclaration && null instructions && isNothing doctype)
$ unexpected "Expected XML prolog"
return $ Prolog xmlDeclaration instructions doctype
textContent :: CharParsing m => Monad m => EntityDecoder -> TokenParser m Text
textContent entityDecoder = TokenParser $ mconcat <$> do
skipComments
(textualData <|> L1.runTokenParser L1.tokenCdata) `sepBy1` L1.runTokenParser L1.tokenComment
where textualData = expandContents entityDecoder =<< L1.runTokenParser L1.tokenData
textContent' :: CharParsing m => Monad m => TokenParser m Text
textContent' = textContent decodeStandardEntities
normalizeAttributes :: EntityDecoder -> [Attribute] -> Map QName Text
normalizeAttributes entityDecoder attributes = Map.fromList $ do
Attribute name contents <- attributes
value <- maybeToList $ expandContents entityDecoder contents
return (name, value)
tag :: CharParsing m => Monad m
=> EntityDecoder
-> NameParser a
-> (a -> AttrParser b)
-> (b -> ContentParser m c)
-> TokenParser m c
tag entityDecoder parseName parseAttributes parseContent = parseStartToEnd <|> parseEmptyElement where
parseStartToEnd = TokenParser $ do
skipCommentsWhitespace
L1.StartTag name attributes <- L1.runTokenParser L1.tokenStartTag
a <- processName name
b <- processAttributes a attributes
c <- case parseContent b of
NoContent f -> f
AnyContent f -> f =<< runTokenParser (many $ anyToken entityDecoder)
WithContent parser -> runTokenParser parser
skipCommentsWhitespace
L1.runTokenParser $ do
name' <- L1.tokenEndTag
when (name /= name') $ fail "Invalid end tag name"
return c
parseEmptyElement = TokenParser $ do
skipCommentsWhitespace
L1.EmptyElementTag name attributes <- L1.runTokenParser L1.tokenEmptyElementTag
a <- processName name
b <- processAttributes a attributes
case parseContent b of
NoContent f -> f
AnyContent f -> f mempty
WithContent parser -> unexpected "Expected non-empty tag"
processName name = runNameParser parseName name
& either unexpected return
processAttributes state attributes = runAttrParser (parseAttributes state) (normalizeAttributes entityDecoder attributes)
& either unexpected return
tag' :: CharParsing m => Monad m
=> NameParser a
-> AttrParser b
-> ContentParser m c
-> TokenParser m c
tag' parseName parseAttributes parseBody = tag decodeStandardEntities parseName (const parseAttributes) (const parseBody)
anyTag :: CharParsing m => Monad m => TokenParser m ()
anyTag = tag' anyName anyAttr anyContent
anyToken :: CharParsing m => Monad m => EntityDecoder -> TokenParser m Token
anyToken entityDecoder = (TokenProlog <$> prolog)
<|> (TokenInstruction <$> instruction)
<|> tokenTag
<|> (TokenTextContent <$> textContent entityDecoder)
where tokenTag = tag entityDecoder anyName (\name -> (name,) <$> forwardAttrs) $ \(name, attributes) ->
TokenTag name attributes <$> AnyContent pure
forwardAttrs = AttrParser Right
anyToken' :: CharParsing m => Monad m => TokenParser m Token
anyToken' = anyToken decodeStandardEntities
skipComments :: CharParsing m => Monad m => m ()
skipComments = void $ many $ L1.runTokenParser L1.tokenComment
skipCommentsWhitespace :: CharParsing m => Monad m => m ()
skipCommentsWhitespace = void $ many $ void (L1.runTokenParser L1.tokenComment) <|> void tokenWhitespace
decodeStandardEntities :: EntityDecoder
decodeStandardEntities = decodePredefinedEntities <> decodeHtmlEntities