{-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternSynonyms #-} {-# LANGUAGE ScopedTypeVariables #-} -- | XML back and forth! -- -- @xmlbf@ doesn't do any parsing of raw XML on its own. Instead, one should -- rely on libraries like -- [xmlbf-xeno](https://hackage.haskell.org/package/xmlbf-xeno) or -- [xmlbf-xmlhtml](https://hackage.haskell.org/package/xmlbf-xmlhtml) for -- this. -- -- @xmlbf@ provides a 'FromXml' class intended to be used as the familiar -- 'Data.Aeson.FromJSON' from the @aeson@ package. This relies on the -- 'Parser' type and the related tools. -- -- @xmlbf@ provides a 'ToXml' class intended to be used as the familiar -- 'Data.Aeson.toJSON' from the @aeson@ package. -- -- @xmlb@ provides tools like 'dfpos' and 'dfposM' for finding a fixpoint -- of a XML structure. module Xmlbf ( -- * Parsing FromXml(fromXml) , Parser , runParser , pFail , pElement , pAnyElement , pName , pAttr , pAttrs , pChildren , pText , pEndOfInput -- * Rendering , ToXml(toXml) , encode , Node , node , pattern Element , element , element' , pattern Text , text , text' -- * Fixpoints , dfpos , dfposM , dfpre , dfpreM ) where import Control.DeepSeq (NFData(rnf)) import qualified Data.ByteString.Builder as BB import qualified Data.ByteString.Builder.Prim as BBP import qualified Data.Char as Char import Data.Foldable (for_, toList) import Data.Functor.Identity (Identity(Identity), runIdentity) import qualified Data.HashMap.Strict as HM import Data.Monoid ((<>)) import Data.Sequence (Seq) import qualified Data.Sequence as Seq import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TL import Data.Traversable (for) import Data.Word (Word8) import Control.Applicative (Alternative(empty, (<|>))) import Control.Monad (MonadPlus(mplus, mzero), join, when) import Control.Monad.Fail (MonadFail(fail)) -------------------------------------------------------------------------------- -- | Either a text or an element node in an XML fragment body. -- -- Construct with 'text' or 'element'. Destruct with 'Text' or 'Element'. data Node = Element' !T.Text !(HM.HashMap T.Text T.Text) ![Node] | Text' !TL.Text deriving (Eq) instance NFData Node where rnf = \case Element' n as cs -> rnf n `seq` rnf as `seq` rnf cs `seq` () Text' t -> rnf t `seq` () instance Show Node where showsPrec n = \x -> showParen (n > 10) $ case x of Text' t -> showString "Text " . showsPrec 0 t Element' t as cs -> showString "Element " . showsPrec 0 t . showChar ' ' . showsPrec 0 (HM.toList as) . showChar ' ' . showsPrec 0 cs -- | Destruct an element 'Node'. pattern Element :: T.Text -> (HM.HashMap T.Text T.Text) -> [Node] -> Node pattern Element t as cs <- Element' t as cs {-# COMPLETE Element #-} -- TODO this leads to silly pattern matching warnings -- | Destruct a text 'Node'. pattern Text :: TL.Text -> Node pattern Text t <- Text' t {-# COMPLETE Text #-} -- TODO this leads to silly pattern matching warnings -- | Case analysis for a 'Node'. node :: (T.Text -> HM.HashMap T.Text T.Text -> [Node] -> a) -- ^ Transform an 'Element' node. -> (TL.Text -> a) -- ^ Transform a 'Text' node. -> Node -> a {-# INLINE node #-} node fe ft = \case Text' t -> ft t Element' t as cs -> fe t as cs -- | Normalizes 'Node's by concatenating consecutive 'Text' nodes. normalize :: [Node] -> [Node] {-# INLINE normalize #-} normalize = \case -- Note that @'Text' ""@ is forbidden by construction, actually. But we do -- take care of it in case the 'Node' was constructed unsafely somehow. Text' "" : ns -> normalize ns Text' a : Text' b : ns -> normalize (text (a <> b) <> ns) Text' a : ns -> Text' a : normalize ns Element' t as cs : ns -> Element' t as (normalize cs) : normalize ns [] -> [] -- | Construct a XML fragment body containing a single 'Text' 'Node', if -- possible. -- -- This function will return empty list if it is not possible to construct the -- 'Text' with the given input. To learn more about /why/ it was not possible to -- construct it, use 'text'' instead. -- -- Using 'text'' rather than 'text' is recommended, so that you are forced to -- acknowledge a failing situation in case it happens. However, 'text' is at -- times more convenient to use, whenever you know the input is valid. text :: TL.Text -> [Node] {-# INLINE text #-} text t = case text' t of Right x -> [x] Left _ -> [] -- | Construct a 'Text' 'Node', if possible. -- -- Returns 'Left' if the 'Text' 'Node' can't be created, with an explanation -- of why. text' :: TL.Text -> Either String Node {-# INLINE text' #-} text' = \case "" -> Left "Empty text" t -> Right (Text' t) -- | Construct a XML fragment body containing a single 'Element' 'Node', if -- possible. -- -- This function will return empty list if it is not possible to construct the -- 'Element' with the given input. To learn more about /why/ it was not possible -- to construct it, use 'element' instead. -- -- Using 'element'' rather than 'element' is recommended, so that you are forced -- to acknowledge a failing situation in case it happens. However, 'element' is -- at times more convenient to use, whenever you know the input is valid. element :: T.Text -- ^ Element' name. -> HM.HashMap T.Text T.Text -- ^ Attributes. -> [Node] -- ^ Children. -> [Node] {-# INLINE element #-} element t hm ns = case element' t hm ns of Right x -> [x] Left _ -> [] -- | Construct an 'Element' 'Node'. -- -- Returns 'Left' if the 'Element' 'Node' can't be created, with an explanation -- of why. element' :: T.Text -- ^ Element' name. -> HM.HashMap T.Text T.Text -- ^ Attributes. -> [Node] -- ^ Children. -> Either String Node element' t0 hm0 ns0 = do when (t0 /= T.strip t0) (Left ("Element name has surrounding whitespace: " ++ show t0)) when (T.null t0) (Left ("Element name is blank: " ++ show t0)) for_ (HM.keys hm0) $ \k -> do when (k /= T.strip k) (Left ("Attribute name has surrounding whitespace: " ++ show k)) when (T.null k) (Left ("Attribute name is blank: " ++ show k)) Right (Element' t0 hm0 (normalize ns0)) -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -- Parsing class FromXml a where -- | Parses an XML fragment body into a value of type @a@. -- -- If a 'ToXml' instance for @a@ exists, then: -- -- @ -- 'runParser' 'fromXml' ('toXml' a) == 'Right' a -- @ fromXml :: Parser a -- | XML parser monad. To be run with 'runParser'. -- -- You can build a 'Parser' using 'pElement', 'pAttr', 'pAttrs', 'pText', -- 'pFail', or any of the 'Applicative', 'Alternative' or 'Monad' combinators. newtype Parser a = Parser { unParser :: S -> Either String (a, S) } deriving (Functor) -- | Run a parser on an XML fragment body. If the parser fails, then a 'String' -- with an error message is returned. -- -- Notice that this function doesn't enforce that all input is consumed. If you -- want that behavior, then please use 'pEndOfInput' in the given 'Parser'. runParser :: Parser a -> [Node] -> Either String a runParser p0 = fmap fst . unParser p0 . STop . normalize -- | Internal parser state. data S = STop ![Node] -- ^ Parsing the top-level nodes. | SReg !T.Text !(HM.HashMap T.Text T.Text) ![Node] -- ^ Parsing a particular root element. deriving (Show) instance Applicative Parser where {-# INLINE pure #-} pure = \a -> Parser (\s -> Right (a, s)) {-# INLINE (<*>) #-} Parser gf <*> Parser ga = Parser $ \s0 -> do (f, s1) <- gf s0 (a, s2) <- ga s1 pure (f a, s2) instance Monad Parser where {-# INLINE (>>=) #-} Parser ga >>= k = Parser $ \s0 -> do (a, s1) <- ga s0 unParser (k a) s1 fail = pFail instance MonadFail Parser where fail = pFail -- | Backtracks. instance Alternative Parser where {-# INLINE empty #-} empty = Parser (\_ -> Left "empty") {-# INLINE (<|>) #-} Parser a <|> Parser b = Parser (\s -> either (\_ -> b s) Right (a s)) -- | Backtracks. instance MonadPlus Parser where {-# INLINE mzero #-} mzero = empty {-# INLINE mplus #-} mplus = (<|>) -------------------------------------------------------------------------------- -- Some parsers -- | A 'Parser' that always fails with the given error message. pFail :: String -> Parser a pFail e = Parser (\_ -> Left e) -- | @'pElement' "foo" p@ runs a 'Parser' @p@ inside a element node named -- @"foo"@. This fails if such element does not exist at the current position. -- -- Leading whitespace is ignored. If you need to preserve that whitespace for -- some reason, capture it using 'pText' before using 'pElement'. -- -- Consumes the element from the parser state. pElement :: T.Text -> Parser a -> Parser a {-# INLINABLE pElement #-} pElement t0 p0 = Parser $ \case SReg t1 as0 (Element' t as cs : cs0) | t == t0 -> do (a,_) <- unParser p0 (SReg t as cs) Right (a, SReg t1 as0 cs0) STop (Element' t as cs : cs0) | t == t0 -> do (a,_) <- unParser p0 (SReg t as cs) Right (a, STop cs0) -- skip leading whitespace SReg t as (Text' x : cs) | TL.all Char.isSpace x -> unParser (pElement t0 p0) (SReg t as cs) STop (Text' x : cs) | TL.all Char.isSpace x -> unParser (pElement t0 p0) (STop cs) _ -> Left ("Missing element " ++ show t0) -- | @'pAnyElement' p@ runs a 'Parser' @p@ inside the element node at the -- current position, if any. Otherwise, if no such element exists, this parser -- fails. -- -- You can recover the name of the matched element using 'pName' inside the -- given 'Parser'. However, if you already know beforehand the name of the -- element that you want to match, it's better to use 'pElement' rather than -- 'pAnyElement'. -- -- Leading whitespace is ignored. If you need to preserve that whitespace for -- some reason, capture it using 'pText' before using 'pAnyElement'. -- -- Consumes the element from the parser state. pAnyElement :: Parser a -> Parser a {-# INLINABLE pAnyElement #-} pAnyElement p0 = Parser $ \case SReg t0 as0 (Element' t as cs : cs0) -> do (a,_) <- unParser p0 (SReg t as cs) Right (a, SReg t0 as0 cs0) STop (Element' t as cs : cs0) -> do (a,_) <- unParser p0 (SReg t as cs) Right (a, STop cs0) -- skip leading whitespace SReg t as (Text' x : cs) | TL.all Char.isSpace x -> unParser (pAnyElement p0) (SReg t as cs) STop (Text' x : cs) | TL.all Char.isSpace x -> unParser (pAnyElement p0) (STop cs) _ -> Left "Missing element" -- | Returns the name of the currently selected element. -- -- This parser fails if there's no currently selected element. -- -- Doesn't modify the parser state. pName :: Parser T.Text {-# INLINABLE pName #-} pName = Parser $ \case SReg t as cs -> Right (t, SReg t as cs) STop _ -> Left "Before selecting an name, you must select an element" -- | Return the value of the requested attribute, if defined. May return an -- empty string in case the attribute is defined but no value was given to it. -- -- This parser fails if there's no currently selected element. -- -- Consumes the attribute from the parser state. pAttr :: T.Text -> Parser T.Text {-# INLINABLE pAttr #-} pAttr n = Parser $ \case STop _ -> Left "Before selecting an attribute, you must select an element" SReg t as cs -> case HM.lookup n as of Just x -> Right (x, SReg t (HM.delete n as) cs) Nothing -> Left ("Missing attribute " ++ show n) -- | Returns all of the available element attributes. May return empty strings -- as values in case an attribute is defined but no value was given to it. -- -- This parser fails if there's no currently selected element. -- -- Consumes all of the remaining attributes for this element from the parser -- state. pAttrs :: Parser (HM.HashMap T.Text T.Text) {-# INLINABLE pAttrs #-} pAttrs = Parser $ \case STop _ -> Left "Before selecting an attribute, you must select an element" SReg t as cs -> Right (as, SReg t mempty cs) -- | Returns all of the immediate children of the current element. -- -- If parsing top-level nodes rather than a particular element (that is, if -- 'pChildren' is /not/ being run inside 'pElement'), then all of the top level -- 'Node's will be returned. -- -- Consumes all of the returned nodes from the parser state. pChildren :: Parser [Node] {-# INLINABLE pChildren #-} pChildren = Parser $ \case STop cs -> Right (cs, STop mempty) SReg t as cs -> Right (cs, SReg t as mempty) -- | Return a text node value. -- -- Surrounidng whitespace is not removed, as it is considered to be part of the -- text node. -- -- If there is no text node at the current position, then this parser fails. -- This implies that 'pText' /never/ returns an empty 'T.Text', since there is -- no such thing as a text node without text. -- -- Please note that consecutive text nodes are always concatenated and returned -- together. -- -- @ -- 'runParser' 'pText' ('text' \"Ha\" <> 'text' \"sk\" <> 'text' \"ell\") -- == 'Right' ('text' "Haskell") -- @ -- -- The returned text is consumed from the parser state. This implies that if you -- perform two consecutive 'pText' calls, the second will always fail. -- -- @ -- 'runParser' ('pText' >> 'pText') ('text' \"Ha\" <> 'text' \"sk\" <> 'text' \"ell\") -- == 'Left' "Missing text node" -- @ pText :: Parser TL.Text {-# INLINABLE pText #-} pText = Parser $ \case -- Note: this works only because we asume 'normalize' has been used. STop (Text x : ns) -> Right (x, STop ns) SReg t as (Text x : cs) -> Right (x, SReg t as cs) _ -> Left "Missing text node" -- | Succeeds if all of the elements, attributes and text nodes have -- been consumed. pEndOfInput :: Parser () {-# INLINABLE pEndOfInput #-} pEndOfInput = Parser (\s -> if isEof s then Right ((), s) else Left "Not end of input yet") isEof :: S -> Bool {-# INLINE isEof #-} isEof = \case SReg _ as cs -> HM.null as && null cs STop ns -> null ns -------------------------------------------------------------------------------- -- Rendering class ToXml a where -- | Renders a value of type @a@ into an XML fragment body. -- -- If a 'FromXml' instance for @a@ exists, then: -- -- @ -- 'runParser' 'fromXml' ('toXml' a) == 'Right' a -- @ toXml :: a -> [Node] -- | Encodes a list of XML 'Node's, representing an XML fragment body, to an -- UTF8-encoded and XML-escaped bytestring. -- -- This function doesn't render self-closing elements. Instead, all -- elements have a corresponding closing tag. -- -- Also, it doesn't render CDATA sections. Instead, all text is escaped as -- necessary. encode :: [Node] -> BB.Builder encode xs = mconcat (map encodeNode xs) where encodeNode :: Node -> BB.Builder encodeNode = \case Text x -> encodeXmlUtf8Lazy x Element t as cs -> -- This ugly code is so that we make sure we always bind concatenation -- to the right with as little effort as possible, using (<>). "<" <> encodeUtf8 t <> encodeAttrs (">" <> encode cs <> "</" <> encodeUtf8 t <> ">") as encodeAttrs :: BB.Builder -> HM.HashMap T.Text T.Text -> BB.Builder encodeAttrs = HM.foldlWithKey' (\o k v -> " " <> encodeUtf8 k <> "=\"" <> encodeXmlUtf8 v <> "\"" <> o) {-# INLINE encodeNode #-} {-# INLINE encodeAttrs #-} -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -- Node fixpoint -- | Post-order depth-first replacement of 'Node' and all of its children. -- -- This function works like 'Data.Function.fix', but the given function is -- trying to find a fixpoint for the individual children nodes, not for the root -- node. -- -- For example, the following function renames every node named @"w"@ to @"y"@, -- and every node named @"y"@ to @"z"@. It accomplishes this by first renaming -- @"w"@ nodes to @"x"@, and then, by using @k@ recursively to further rename -- all @"x"@ nodes (including the ones that were just created) to @"y"@ in a -- post-order depth-first manner. After renaming an @"x"@ node to @"y"@, the -- recursion stops (i.e., @k@ is not used), so our new @"y"@ nodes won't be -- further renamed to @"z"@. However, nodes that were named @"y"@ initially will -- be renamed to @"z"@. -- -- In our example we only replace one node with another, but a node can be -- replaced with zero or more nodes, depending on the length of the resulting -- list. -- -- @ -- foo :: 'Node' -> ['Node'] -- foo = 'dfpos' $ \\k -> \\case -- 'Element' "w" as cs -> 'element'' "x" as cs >>= k -- 'Element' "x" as cs -> 'element'' "y" as cs -- 'Element' "y" as cs -> 'element'' "z" as cs >>= k -- @ -- -- See 'dfpre' for pre-orderd depth-first replacement. -- -- /WARNING/ If you call @k@ in every branch, then 'dfpos' will never terminate. -- Make sure the recursion stops at some point by simply returning a list of -- nodes instead of calling @k@. dfpos :: ((Node -> [Node]) -> Node -> [Node]) -> Node -> [Node] dfpos f = runIdentity . dfposM (\k -> Identity . f (runIdentity . k)) -- | Monadic version of 'dfpos'. dfposM :: Monad m => ((Node -> m [Node]) -> Node -> m [Node]) -> Node -> m [Node] dfposM f = \n0 -> do c1 <- traverseChildren (dfposM f) (cursorFromNode n0) c2 <- traverseRightSiblings (dfposM f) c1 fmap (normalize . join) (traverse (f (dfposM f)) (cursorSiblings c2)) -- | Pre-order depth-first replacement of 'Node' and all of its children. -- -- This is just like 'dfpos' but the search proceeds in a different order. dfpre :: ((Node -> [Node]) -> Node -> [Node]) -> Node -> [Node] dfpre f = runIdentity . dfpreM (\k -> Identity . f (runIdentity . k)) -- | Monadic version of 'dfpre'. dfpreM :: Monad m => ((Node -> m [Node]) -> Node -> m [Node]) -> Node -> m [Node] dfpreM f = \n0 -> do ns <- f (dfpreM f) n0 fmap (normalize . join) $ for ns $ \n -> do c1 <- traverseChildren (dfpreM f) (cursorFromNode n) cursorSiblings <$> traverseRightSiblings (dfpreM f) c1 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -- INTERNAL: Cursor -- -- Most of this comes from Chris Smith's xmlhtml, BSD3 licensed -- https://hackage.haskell.org/package/xmlhtml -- | Zipper into a 'Node' tree. data Cursor = Cursor { _cursorCurrent :: !Node -- ^ Retrieves the current node of a 'Cursor'. , _cursorLefts :: !(Seq Node) -- ^ Nodes to the left (ordered right to left). , _cursorRights :: !(Seq Node) -- ^ Nodes to the right (ordered left to right). , _cursorParents :: !(Seq (Seq Node, T.Text, HM.HashMap T.Text T.Text, Seq Node)) -- ^ Parents' name, attributes, and siblings. } ------------------------------------------------------------------------------ -- | The cursor if left where it starts. traverseChildren :: Monad m => (Node -> m [Node]) -> Cursor -> m Cursor {-# INLINE traverseChildren #-} traverseChildren f c0 = case _cursorCurrent c0 of Text _ -> pure c0 Element t as cs -> do n1s <- fmap (normalize . join) (traverse f cs) pure (c0 {_cursorCurrent = Element' t as n1s}) -- | The cursor if left in the rightmost sibling. traverseRightSiblings :: Monad m => (Node -> m [Node]) -> Cursor -> m Cursor {-# INLINE traverseRightSiblings #-} traverseRightSiblings f c0 = case cursorRemoveRight c0 of Nothing -> pure c0 Just (n1, c1) -> do n2s <- fmap normalize (f n1) traverseRightSiblings f (cursorInsertManyRight n2s c1) -- | Builds a 'Cursor' for navigating a tree. That is, a forest with a single -- root 'Node'. cursorFromNode :: Node -> Cursor {-# INLINE cursorFromNode #-} cursorFromNode n = Cursor n mempty mempty mempty -- | Retrieves a list of the 'Node's at the same level as the current position -- of a cursor, including the current node. cursorSiblings :: Cursor -> [Node] {-# INLINE cursorSiblings #-} cursorSiblings (Cursor cur ls rs _) = toList (Seq.reverse ls <> (cur Seq.<| rs)) -- | Removes the node to the right and return it. cursorRemoveRight :: Cursor -> Maybe (Node, Cursor) {-# INLINE cursorRemoveRight #-} cursorRemoveRight = \case Cursor n ls rs0 ps | not (Seq.null rs0) -> case Seq.viewl rs0 of r Seq.:< rs -> Just (r, Cursor n ls rs ps) _ -> undefined -- unreachable, rs0 is not empty _ -> Nothing -- | Inserts a list of new 'Node's to the right of the current position. cursorInsertManyRight :: [Node] -> Cursor -> Cursor {-# INLINE cursorInsertManyRight #-} cursorInsertManyRight ns (Cursor nn ls rs ps) = Cursor nn ls (Seq.fromList ns <> rs) ps -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -- Miscellaneous encodeUtf8 :: T.Text -> BB.Builder {-# INLINE encodeUtf8 #-} encodeUtf8 = T.encodeUtf8Builder encodeXmlUtf8 :: T.Text -> BB.Builder {-# INLINE encodeXmlUtf8 #-} encodeXmlUtf8 = T.encodeUtf8BuilderEscaped xmlEscaped encodeXmlUtf8Lazy :: TL.Text -> BB.Builder {-# INLINE encodeXmlUtf8Lazy #-} encodeXmlUtf8Lazy = TL.encodeUtf8BuilderEscaped xmlEscaped xmlEscaped :: BBP.BoundedPrim Word8 {-# INLINE xmlEscaped #-} xmlEscaped = BBP.condB (== 38) (fixed5 (38,(97,(109,(112,59))))) $ -- '&' -> "&" BBP.condB (== 60) (fixed4 (38,(108,(116,59)))) $ -- '<' -> "<" BBP.condB (== 62) (fixed4 (38,(103,(116,59)))) $ -- '>' -> ">" BBP.condB (== 34) (fixed5 (38,(35,(51,(52,59))))) $ -- '"' -> """ BBP.liftFixedToBounded BBP.word8 where {-# INLINE fixed4 #-} fixed4 :: (Word8, (Word8, (Word8, Word8))) -> BBP.BoundedPrim Word8 fixed4 x = BBP.liftFixedToBounded (const x BBP.>$< BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8) {-# INLINE fixed5 #-} fixed5 :: (Word8, (Word8, (Word8, (Word8, Word8)))) -> BBP.BoundedPrim Word8 fixed5 x = BBP.liftFixedToBounded (const x BBP.>$< BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8)