{-# LANGUAGE CPP #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}

{-
  Files Xmlbf and Xeno have been taken from:
  https://gitlab.com/k0001/xmlbf

  Which is licensed under Apache License 2.0.

  Read the comments in the Xmlbf.hs file for the reason why.
-}

module Text.RDF.RDF4H.XmlParser.Xeno
 ( fromXenoNode
 , fromRawXml
 ) where

import qualified Data.Bifunctor as Bif
import qualified Data.ByteString as B
import qualified Data.HashMap.Strict as HM
#if !MIN_VERSION_base(4,13,0)
import Data.Monoid ((<>))
#else
#endif
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Builder as TB
import qualified Data.Text.Encoding as T
import Data.Traversable (for)
import qualified HTMLEntities.Decoder
import qualified Xeno.DOM as Xeno
import qualified Text.RDF.RDF4H.XmlParser.Xmlbf as Xmlbf

--------------------------------------------------------------------------------
-- Xeno support

-- | Convert a 'Xeno.Node' from "Xeno.DOM" into an 'Element' from "Xmlbf".
fromXenoNode
  :: Xeno.Node -- ^ A 'Xeno.Node' from "Xeno.DOM".
  -> Either String Xmlbf.Node -- ^ A 'Xmlbf.Node' from "Xmlbf".
fromXenoNode :: Node -> Either String Node
fromXenoNode Node
x = do
  Text
n <- ByteString -> Either String Text
decodeUtf8 (Node -> ByteString
Xeno.name Node
x)
  [(Text, Text)]
as <- [(ByteString, ByteString)]
-> ((ByteString, ByteString) -> Either String (Text, Text))
-> Either String [(Text, Text)]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
t a -> (a -> f b) -> f (t b)
for (Node -> [(ByteString, ByteString)]
Xeno.attributes Node
x) (((ByteString, ByteString) -> Either String (Text, Text))
 -> Either String [(Text, Text)])
-> ((ByteString, ByteString) -> Either String (Text, Text))
-> Either String [(Text, Text)]
forall a b. (a -> b) -> a -> b
$ \(ByteString
k,ByteString
v) -> do
     (,) (Text -> Text -> (Text, Text))
-> Either String Text -> Either String (Text -> (Text, Text))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> Either String Text
decodeUtf8 ByteString
k Either String (Text -> (Text, Text))
-> Either String Text -> Either String (Text, Text)
forall a b.
Either String (a -> b) -> Either String a -> Either String b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> ByteString -> Either String Text
unescapeXmlUtf8 ByteString
v
  [Node]
cs <- [Content]
-> (Content -> Either String Node) -> Either String [Node]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
t a -> (a -> f b) -> f (t b)
for (Node -> [Content]
Xeno.contents Node
x) ((Content -> Either String Node) -> Either String [Node])
-> (Content -> Either String Node) -> Either String [Node]
forall a b. (a -> b) -> a -> b
$ \case
     Xeno.Element Node
n1 -> Node -> Either String Node
fromXenoNode Node
n1
     Xeno.Text ByteString
bs -> Text -> Either String Node
Xmlbf.text' (Text -> Either String Node)
-> Either String Text -> Either String Node
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< ByteString -> Either String Text
unescapeXmlUtf8Lazy ByteString
bs
     Xeno.CData ByteString
bs -> Text -> Either String Node
Xmlbf.text' (Text -> Either String Node)
-> Either String Text -> Either String Node
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< ByteString -> Either String Text
decodeUtf8Lazy ByteString
bs
  Text -> HashMap Text Text -> [Node] -> Either String Node
Xmlbf.element' Text
n ([(Text, Text)] -> HashMap Text Text
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
HM.fromList [(Text, Text)]
as) [Node]
cs

-- | Parses a given UTF8-encoded raw XML fragment into @a@, using the @xeno@
-- Haskell library, so all of @xeno@'s parsing quirks apply.
--
-- You can provide the output of this function as input to "Xmlbf"'s
-- 'Xmlbf.parse'.
--
-- The given XML can contain more zero or more text or element nodes.
--
-- Surrounding whitespace is not stripped.
fromRawXml
  :: B.ByteString                 -- ^ Raw XML fragment.
  -> Either String [Xmlbf.Node]   -- ^ 'Xmlbf.Node's from "Xmlbf"
fromRawXml :: ByteString -> Either String [Node]
fromRawXml = \ByteString
bs -> case ByteString -> Either XenoException Node
Xeno.parse (ByteString
"<x>" ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> ByteString -> ByteString
dropBomUtf8 ByteString
bs ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> ByteString
"</x>") of
  Left XenoException
e -> String -> Either String [Node]
forall a b. a -> Either a b
Left (String
"Malformed XML: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ XenoException -> String
forall a. Show a => a -> String
show XenoException
e)
  Right Node
n -> Node -> Either String Node
fromXenoNode Node
n Either String Node
-> (Node -> Either String [Node]) -> Either String [Node]
forall a b.
Either String a -> (a -> Either String b) -> Either String b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
    Xmlbf.Element Text
"x" HashMap Text Text
_ [Node]
cs -> [Node] -> Either String [Node]
forall a b. b -> Either a b
Right [Node]
cs
    Node
_ -> String -> Either String [Node]
forall a b. a -> Either a b
Left String
"Unknown result from fromXenoNode. Please report this as a bug."

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Miscellaneous

decodeUtf8 :: B.ByteString -> Either String T.Text
{-# INLINE decodeUtf8 #-}
decodeUtf8 :: ByteString -> Either String Text
decodeUtf8 ByteString
bs = (UnicodeException -> String)
-> Either UnicodeException Text -> Either String Text
forall a b c. (a -> b) -> Either a c -> Either b c
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
Bif.first UnicodeException -> String
forall a. Show a => a -> String
show (ByteString -> Either UnicodeException Text
T.decodeUtf8' ByteString
bs)

decodeUtf8Lazy :: B.ByteString -> Either String TL.Text
{-# INLINE decodeUtf8Lazy #-}
decodeUtf8Lazy :: ByteString -> Either String Text
decodeUtf8Lazy ByteString
bs = (Text -> Text) -> Either String Text -> Either String Text
forall a b. (a -> b) -> Either String a -> Either String b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Text -> Text
TL.fromStrict (ByteString -> Either String Text
decodeUtf8 ByteString
bs)

unescapeXmlUtf8 :: B.ByteString -> Either String T.Text
{-# INLINE unescapeXmlUtf8 #-}
unescapeXmlUtf8 :: ByteString -> Either String Text
unescapeXmlUtf8 ByteString
bs = (Text -> Text) -> Either String Text -> Either String Text
forall a b. (a -> b) -> Either String a -> Either String b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Text -> Text
TL.toStrict (ByteString -> Either String Text
unescapeXmlUtf8Lazy ByteString
bs)

unescapeXmlUtf8Lazy :: B.ByteString -> Either String TL.Text
{-# INLINE unescapeXmlUtf8Lazy #-}
unescapeXmlUtf8Lazy :: ByteString -> Either String Text
unescapeXmlUtf8Lazy ByteString
bs = do
   Text
t <- ByteString -> Either String Text
decodeUtf8 ByteString
bs
   Text -> Either String Text
forall a. a -> Either String a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Builder -> Text
TB.toLazyText (Text -> Builder
HTMLEntities.Decoder.htmlEncodedText Text
t))

dropBomUtf8 :: B.ByteString -> B.ByteString
{-# INLINE dropBomUtf8 #-}
dropBomUtf8 :: ByteString -> ByteString
dropBomUtf8 ByteString
bs | ByteString -> ByteString -> Bool
B.isPrefixOf ByteString
"\xEF\xBB\xBF" ByteString
bs = Int -> ByteString -> ByteString
B.drop Int
3 ByteString
bs
               | Bool
otherwise = ByteString
bs