{-# LANGUAGE LambdaCase #-}

module Xmlbf.XmlHtml
 ( fromXmlHtmlNode
 , fromRawXml
 , fromRawHtml
 ) where

import qualified Data.ByteString as B
import qualified Data.HashMap.Strict as HM
import qualified Text.XmlHtml as XmlHtml
import qualified Xmlbf

--------------------------------------------------------------------------------
-- XmlHtml support

-- | Convert a 'XmlHtml.Node' from "Text.XmlHtml" into an 'Node' from "Xmlbf",
-- if possible.
fromXmlHtmlNode
  :: XmlHtml.Node -- ^ A 'XmlHtml.Node' from "Text.XmlHtml".
  -> Either String Xmlbf.Node -- ^ A 'Xmlbf.Node' from "Xmlbf", if possible.
fromXmlHtmlNode :: Node -> Either String Node
fromXmlHtmlNode = \case
  XmlHtml.Comment Text
_ -> forall a b. a -> Either a b
Left String
"Comments not supported"
  XmlHtml.TextNode Text
t -> Text -> Either String Node
Xmlbf.text' Text
t
  XmlHtml.Element Text
t [(Text, Text)]
as [Node]
cs -> do
    [Node]
cs' <- forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
traverse Node -> Either String Node
fromXmlHtmlNode [Node]
cs
    Text -> HashMap Text Text -> [Node] -> Either String Node
Xmlbf.element' Text
t (forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
HM.fromList [(Text, Text)]
as) [Node]
cs'

-- | Parses a given UTF8-encoded raw XML fragment into @a@, using the @xmlhtml@
-- Haskell library, so all of @xmlhtml@'s parsing quirks apply.
--
-- You can provide the output of this function as input to "Xmlbf"'s
-- 'Xmlbf.parse'.
--
-- The given XML can contain more zero or more text or element nodes.
--
-- Comments are discarded from the resulting nodes and their children.
--
-- Surrounding whitespace is not stripped.
fromRawXml
  :: B.ByteString                 -- ^ Raw XML fragment.
  -> Either String [Xmlbf.Node]   -- ^ 'Xmlbf.Node's from "Xmlbf"
fromRawXml :: ByteString -> Either String [Node]
fromRawXml = \ByteString
bs -> case String -> ByteString -> Either String Document
XmlHtml.parseXML String
"xmlbf-xmlhtml-input.xml" ByteString
bs of
  Left String
e -> forall a b. a -> Either a b
Left (String
"Malformed XML: " forall a. [a] -> [a] -> [a]
++ String
e)
  Right Document
d -> forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
traverse Node -> Either String Node
fromXmlHtmlNode (Document -> [Node]
XmlHtml.docContent Document
d)

-- | Like 'fromRawXml', but parses using @xmlhtml@'s quirks HTML mode.
fromRawHtml
  :: B.ByteString                 -- ^ Raw HTML fragment.
  -> Either String [Xmlbf.Node]   -- ^ 'Xmlbf.Node's from "Xmlbf"
fromRawHtml :: ByteString -> Either String [Node]
fromRawHtml = \ByteString
bs -> case String -> ByteString -> Either String Document
XmlHtml.parseHTML String
"xmlbf-xmlhtml-input.html" ByteString
bs of
  Left String
e -> forall a b. a -> Either a b
Left (String
"Malformed HTML: " forall a. [a] -> [a] -> [a]
++ String
e)
  Right Document
d -> forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
traverse Node -> Either String Node
fromXmlHtmlNode (Document -> [Node]
XmlHtml.docContent Document
d)