src/Text/XML/DOM/Parser/Combinators.hs

module Text.XML.DOM.Parser.Combinators
  ( -- * Generic combinators to traverse descendants
    traverseElems
  , inFilteredTrav
    -- * Using 'DomTraversable'
  , inElemTrav
  , inElem
  , inElemAll
  , inElemMay
  , inElemNe
    -- * Dive combinators
  , divePath
  , diveElem
    -- * Explicit ignoring elements
  , ignoreElem
  , ignoreEmpty
  , ignoreBlank
    -- * Getting current element's properties
  , getCurrentName
  , getCurrentContent
  , getCurrentAttributes
  , getCurrentAttribute
    -- * Current element's checks
  , checkCurrentName
    -- * Parsing element's content
  , parseContent
  , readContent
  , maybeReadContent
    -- * Parsing attributes
  , parseAttribute
  ) where

import           Control.Lens
import           Control.Monad.Except
import           Control.Monad.Reader
import           Data.List.NonEmpty (NonEmpty(..))
import qualified Data.Map.Strict as M
import           Data.Monoid
import           Data.Text (Text)
import qualified Data.Text as T
import           Data.Traversable
import           Data.Typeable
import           Text.Read
import           Text.XML
import           Text.XML.DOM.Parser.Types
import           Text.XML.Lens


-- | Generic function to traverse arbitrary inner cursors.
traverseElems
  :: (Monad m, Foldable g, Traversable f)
  => ([Element] -> DomParserT g m (f (DomPath, Element)))
     -- ^ Takes list of current elements and returns container with
     -- pairs of subpath (relatively to current element) and element
     -- to run parser in
  -> DomParserT Identity m a
     -- ^ Parser will be runned for each element found in traversable
  -> DomParserT g m (f a)
traverseElems trav parser = do
  pd <- ask
  inner <- trav $ pd ^.. pdElements . folded
  for inner $ \(subpath, e) -> do
    let newpd = ParserData
          { _pdElements = Identity e
          , _pdPath     = pd ^. pdPath <> subpath }
    magnify (to $ const newpd) parser -- type of reader is changed, so
                                      -- local does not work

-- | Takes function filtering
inFilteredTrav
  :: (Monad m, Foldable g, DomTraversable f)
  => ([Element] -> (DomPath, [Element]))
   -- ^ Function returning some filtered elements with path suffixes which will
   -- be appended to parser's state
  -> DomParserT Identity m a
  -> DomParserT g m (f a)
inFilteredTrav deeper = traverseElems trav
  where
    trav e = do
      let (path, elems) = deeper e
      case buildDomTraversable elems of
        Nothing -> throwParserError $ PENotFound . (<> path)
        Just tr -> return $ fmap (path,) tr

inElemTrav
  :: (Monad m, Foldable g, DomTraversable f)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m (f a)
inElemTrav n = inFilteredTrav deeper
  where
    deeper = ([n],) . toListOf (folded . nodes . folded . _Element . ell n)

-- | Runs parser inside first children element with given name
inElem
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m a
inElem n = fmap runIdentity . inElemTrav n

inElemAll
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m [a]
inElemAll = inElemTrav

inElemMay
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m (Maybe a)
inElemMay = inElemTrav

inElemNe
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m (NonEmpty a)
inElemNe = inElemTrav

{- | Dive given parser's current tags set into the given path. The @divePath
["a", "b"]@ differs from @inElem "a" . inElem "b"@. Namely the first variant
will not fail if occured tag "a" which does not contains tag "b". This
behaviour is desireable when you dont want to parse whole XML and just want
to pull tags in some path. The other difference is in traversing inner
elements. Consider this code

@
inElem "a" $ inElem "b" $ inElemAll "c" fromDom
@

which translates to pseudo-CSS query like: @a:nth(1) > b:nth(1) > c > fromDom@

@
divePath ["a", "b"] $ inElemAll "c" fromDom
@

which translates like: @a > b > c > fromDom@

As you can see, inElem always takes first element and runs inner parser in this
single element, unlike 'divePath' which runs inner parser @in all@ descendants
in given path.
-}

divePath
  :: forall m g a
   . (Monad m, Foldable g)
  => [Text]
  -> DomParserT [] m a
  -> DomParserT g m a
divePath path = magnify $ to modElems
  where
    modElems
      = over pdElements (toListOf $ folded . diver)
      . over pdPath (<> path)
    diver :: Fold Element Element
    diver    = foldr (.) id $ map toDive path
    toDive n = nodes . folded . _Element . ell n

diveElem
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT [] m a
  -> DomParserT g m a
diveElem p = divePath [p]

-- | Ignore arbitrary current element if it conforms to predicate.
ignoreElem
  :: (Monad m)
  => (Element -> Bool)
     -- ^ Predicate checking that we must ignore some current tag. If returns
     -- 'True' then parser will not be runned and combinator just returns Nothing.
  -> DomParserT Identity m a
  -> DomParserT Identity m (Maybe a)
ignoreElem test parser = do
  ign <- view $ pdElements . to (test . runIdentity)
  if ign then pure Nothing else Just <$> parser

-- | If current element has no children nodes does not run parser and returns
-- Nothing. Otherwise runs parser inside current element. Usefull when you got
-- XML with strange empty elements which must be just ignored, but `inElem` runs
-- parser inside of this elements which causes to parser error.
ignoreEmpty
  :: (Monad m)
  => DomParserT Identity m a
  -> DomParserT Identity m (Maybe a)
ignoreEmpty = ignoreElem test
  where
    test e = null $ e ^. nodes

-- | If all current elements contains blank content, or contains nothing at all
-- , then returns Nothing, else runs parser.
ignoreBlank
  :: (Monad m)
  => DomParserT Identity m a
  -> DomParserT Identity m (Maybe a)
ignoreBlank = ignoreElem test
  where
    test e =
      let
        elems = e ^.. nodes . folded . _Element
        cont = mconcat $ e ^.. nodes . folded . _Content
      in if | not $ null elems      -> False
            | T.null $ T.strip cont -> True
            | otherwise             -> False

-- | Returns name of current element.
--
-- @since 1.0.0
getCurrentName :: (Monad m) => DomParserT Identity m Text
getCurrentName = view $ pdElements . to runIdentity . localName

-- | If name of current tag differs from first argument throws 'PENotFound' with
-- tag name replaced in last path's segment. Usefull for checking root
-- document's element name.
checkCurrentName
  :: (Monad m)
  => Text
  -> DomParserT Identity m ()
checkCurrentName n = do
  cn <- getCurrentName
  unless (cn == n) $ do
    p <- view pdPath
    let pinit = if null p then [] else init p
    throwError $ ParserErrors [PENotFound $ pinit ++ [n]]
  return ()

-- | Get current content. If current element contains no content or
-- have inner elements then Nothing returned
--
-- @since 1.0.0
getCurrentContent :: (Monad m) => DomParserT Identity m (Maybe Text)
getCurrentContent = do
  nds <- view $ pdElements . to runIdentity . nodes
  let
    els :: [Element]
    els = nds ^.. folded . _Element
    conts :: [Text]
    conts = nds ^.. folded . _Content
  return $ if
    | not $ null els -> Nothing
    | null conts     -> Nothing
    | otherwise      -> Just $ mconcat conts

-- | Parses content inside current tag. It expects current element set
-- consists of exactly ONE element. If current element does not
-- contains content or have other elements as childs then throws error
parseContent
  :: (Monad m)
  => (Text -> Either Text a)
     -- ^ Content parser, return error msg if value is not parsed
  -> DomParserT Identity m a
parseContent parse = getCurrentContent >>= \case
  Nothing -> throwParserError PEContentNotFound
  Just c  -> case parse c of
    Left e  -> throwParserError $ PEWrongFormat e
    Right a -> return a

-- | If reader returns 'Nothing' then resulting function returns 'Left
-- "error message"'
--
-- @since 1.0.0
maybeReadContent
  :: forall a
   . (Typeable a)
  => (Text -> Maybe a)
   -- ^ Content or attribute reader
  -> Text
   -- ^ Content or attribute value
  -> Either Text a
maybeReadContent f t = maybe (Left msg) Right $ f t
  where
    msg = "Not readable " <> n <> ": " <> t
    n = T.pack $ show $ typeRep (Proxy :: Proxy a)

-- | Tries to read given text to value using 'Read'. Usefull to use
-- with 'parseContent' and 'parseAttribute'
readContent
  :: (Read a, Typeable a)
  => Text
  -> Either Text a
readContent = maybeReadContent $ readMaybe . T.unpack . T.strip

-- | Retuns map of attributes of current element
--
-- @since 1.0.0
getCurrentAttributes :: (Monad m) => DomParserT Identity m (M.Map Name Text)
getCurrentAttributes = view $ pdElements . to runIdentity . attrs

-- | Returns element with given name or 'Nothing'
--
-- @since 1.0.0
getCurrentAttribute :: (Monad m) => Text -> DomParserT Identity m (Maybe Text)
getCurrentAttribute attrName'
  = preview $ pdElements . to runIdentity . attr attrName
  where
    attrName = Name attrName' Nothing Nothing

-- | Parses attribute with given name, throws error if attribute is not found.
--
-- @since 1.0.0
parseAttribute
  :: (Monad m)
  => Text
     -- ^ Attribute name
  -> (Text -> Either Text a)
     -- ^ Attribute content parser
  -> DomParserT Identity m a
parseAttribute attrName parser = getCurrentAttribute attrName >>= \case
  Nothing   -> throwParserError $ PEAttributeNotFound attrName
  Just aval -> case parser aval of
    Left err -> throwParserError $ PEAttributeWrongFormat attrName err
    Right a  -> return a