{-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE TypeFamilies #-} -- | High-level XML parsers, built on top of "Data.XML.Parser.Mid": -- -- - entity references are expanded -- - CDATAs are abstracted away -- - comments are ignored -- - whitespace between tokens is ignored -- - duplicate attributes are ignored -- -- All documentation examples assume the following setup: -- -- > :set -XOverloadedStrings -- > import Data.Attoparsec.ByteString module Data.XML.Parser.High ( module Data.XML.Parser.High.AttrParser , module Data.XML.Parser.High.NameParser , Prolog(..) , Token(..) , TokenParser() , ContentParser() , noContent , withContent , anyContent , runTokenParser , prolog , instruction , textContent , textContent' , tag , tag' , anyTag , anyToken , anyToken' ) where import Control.Applicative import Control.Arrow import Control.Monad.Compat import Control.Monad.Fail.Compat import Data.Function import Data.Map (Map) import qualified Data.Map as Map import Data.Maybe import Data.String import Data.Text (Text) import qualified Data.Text as Text import Data.XML.Parser.High.AttrParser import Data.XML.Parser.High.NameParser import Data.XML.Parser.Low import qualified Data.XML.Parser.Mid as L1 import Data.XML.Parser.Mid.Attribute import Prelude () import Prelude.Compat import Text.Parser.Char import Text.Parser.Combinators import Text.ParserCombinators.ReadP (readP_to_S) -- $setup -- >>> :set -XOverloadedStrings -- >>> import Data.Attoparsec.ByteString -- | XML document prolog. data Prolog = Prolog { prologXmlDeclaration :: Maybe L1.XMLDeclaration , prologInstructions :: [L1.Instruction] , prologDoctype :: Maybe L1.Doctype } deriving (Eq, Ord, Read, Show) data Token = TokenProlog Prolog | TokenInstruction L1.Instruction | TokenTag QName (Map QName Text) [Token] | TokenTextContent Text deriving (Eq, Ord, Read, Show) -- | A parser that consumes whole 'Token's. newtype TokenParser m a = TokenParser { runTokenParser :: m a } deriving instance Functor m => Functor (TokenParser m) deriving instance Applicative m => Applicative (TokenParser m) deriving instance Alternative m => Alternative (TokenParser m) deriving instance Monad m => Monad (TokenParser m) instance (Parsing m, Monad m) => MonadFail (TokenParser m) where fail = TokenParser . unexpected -- | How to parse tag content. data ContentParser m a = NoContent (m a) | AnyContent ([Token] -> m a) | WithContent (TokenParser m a) deriving instance Functor m => Functor (ContentParser m) -- | Assert that content is not empty, and parse it using given token parser. -- -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "body" -- Right "body" -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "" -- Left ... -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "" -- Left ... -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent $ pure ())) "" -- Left ... withContent :: TokenParser m a -> ContentParser m a withContent = WithContent -- | Assert that content is empty. -- -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) "body" -- Left ... -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) "" -- Right () -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) "" -- Right () noContent :: Applicative m => ContentParser m () noContent = NoContent $ pure () -- | Accept any content, including empty content. -- -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) "body" -- Right () -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) "" -- Right () -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) "" -- Right () anyContent :: CharParsing m => Monad m => ContentParser m () anyContent = AnyContent $ const $ pure () -- | Parse a processing instruction. -- -- -- -- >>> parseOnly (runTokenParser instruction) "" -- Right (Instruction "php" "echo 'Hello World!'; ") -- >>> parseOnly (runTokenParser instruction) " " -- Right (Instruction "php" "echo 'Hello World!'; ") instruction :: CharParsing m => Monad m => TokenParser m L1.Instruction instruction = TokenParser $ do skipCommentsWhitespace L1.runTokenParser L1.tokenInstruction -- | -- -- >>> parseOnly (runTokenParser prolog) "" -- Right (Prolog {prologXmlDeclaration = Just (XMLDeclaration "1.0" ...), prologInstructions = [], prologDoctype = Just (Doctype "greeting" ...)}) -- >>> parseOnly (runTokenParser prolog) " " -- Right (Prolog {prologXmlDeclaration = Just (XMLDeclaration "1.0" ...), prologInstructions = [], prologDoctype = Just (Doctype "greeting" ...)}) prolog :: CharParsing m => Monad m => TokenParser m Prolog prolog = TokenParser $ do xmlDeclaration <- optional $ L1.runTokenParser L1.tokenXmlDeclaration skipCommentsWhitespace instructions <- runTokenParser $ many instruction doctype <- optional $ do skipCommentsWhitespace L1.runTokenParser L1.tokenDoctype when (isNothing xmlDeclaration && null instructions && isNothing doctype) $ unexpected "Expected XML prolog" return $ Prolog xmlDeclaration instructions doctype -- | Parse textual content of a tag, including CDATA. textContent :: CharParsing m => Monad m => EntityDecoder -> TokenParser m Text textContent entityDecoder = TokenParser $ mconcat <$> do skipComments (textualData <|> L1.runTokenParser L1.tokenCdata) `sepBy1` L1.runTokenParser L1.tokenComment where textualData = expandContents entityDecoder =<< L1.runTokenParser L1.tokenData -- | Same as @textContent (decodePredefinedEntities <> decodeHtmlEntities)@, provided for convenience. -- -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "bodyinnerbody]]>" -- Right "bodyinnerbody" textContent' :: CharParsing m => Monad m => TokenParser m Text textContent' = textContent decodeStandardEntities normalizeAttributes :: EntityDecoder -> [Attribute] -> Map QName Text normalizeAttributes entityDecoder attributes = Map.fromList $ do Attribute name contents <- attributes value <- maybeToList $ expandContents entityDecoder contents return (name, value) -- | Generic tag parser. tag :: CharParsing m => Monad m => EntityDecoder -- ^ How to expand entity references -> NameParser a -- ^ How to parse tag name -> (a -> AttrParser b) -- ^ How to parse tag attributes -> (b -> ContentParser m c) -- ^ How to parse tag content -> TokenParser m c tag entityDecoder parseName parseAttributes parseContent = parseStartToEnd <|> parseEmptyElement where parseStartToEnd = TokenParser $ do skipCommentsWhitespace L1.StartTag name attributes <- L1.runTokenParser L1.tokenStartTag a <- processName name b <- processAttributes a attributes c <- case parseContent b of NoContent f -> f AnyContent f -> f =<< runTokenParser (many $ anyToken entityDecoder) WithContent parser -> runTokenParser parser skipCommentsWhitespace L1.runTokenParser $ do name' <- L1.tokenEndTag when (name /= name') $ fail "Invalid end tag name" return c parseEmptyElement = TokenParser $ do skipCommentsWhitespace L1.EmptyElementTag name attributes <- L1.runTokenParser L1.tokenEmptyElementTag a <- processName name b <- processAttributes a attributes case parseContent b of NoContent f -> f AnyContent f -> f mempty WithContent parser -> unexpected "Expected non-empty tag" processName name = runNameParser parseName name & maybe (unexpected "Unexpected name") return processAttributes state attributes = runAttrParser (parseAttributes state) (normalizeAttributes entityDecoder attributes) & maybe (unexpected "Unexpected attributes") return -- | Simplified version of 'tag': -- -- - no state forwarding between name, attributes and content parsers -- - uses @decodePredefinedEntities <> decodeHtmlEntities@ to expand entity references tag' :: CharParsing m => Monad m => NameParser a -- ^ How to parse tag name -> AttrParser b -- ^ How to parse tag attributes -> ContentParser m c -- ^ How to parse tag content -> TokenParser m c tag' parseName parseAttributes parseBody = tag decodeStandardEntities parseName (const parseAttributes) (const parseBody) -- | Parse a tag with any name, any attributes and any content. -- -- >>> parseOnly (runTokenParser anyTag) "body" -- Right () -- >>> parseOnly (runTokenParser anyTag) "" -- Right () -- >>> parseOnly (runTokenParser anyTag) "" -- Right () -- >>> parseOnly (runTokenParser anyTag) "body" -- Right () anyTag :: CharParsing m => Monad m => TokenParser m () anyTag = tag' anyName anyAttr anyContent -- | Parse any 'Token'. anyToken :: CharParsing m => Monad m => EntityDecoder -> TokenParser m Token anyToken entityDecoder = (TokenProlog <$> prolog) <|> (TokenInstruction <$> instruction) <|> tokenTag <|> (TokenTextContent <$> textContent entityDecoder) where tokenTag = tag entityDecoder anyName (\name -> (name,) <$> forwardAttrs) $ \(name, attributes) -> TokenTag name attributes <$> AnyContent pure forwardAttrs = AttrParser Just -- | Same as @anyToken (decodePredefinedEntities <> decodeHtmlEntities)@, provided for convenience. anyToken' :: CharParsing m => Monad m => TokenParser m Token anyToken' = anyToken decodeStandardEntities -- * Private functions skipComments :: CharParsing m => Monad m => m () skipComments = void $ many $ L1.runTokenParser L1.tokenComment skipCommentsWhitespace :: CharParsing m => Monad m => m () skipCommentsWhitespace = void $ many $ void (L1.runTokenParser L1.tokenComment) <|> void tokenWhitespace decodeStandardEntities :: EntityDecoder decodeStandardEntities = decodePredefinedEntities <> decodeHtmlEntities