-- | A parser for the Xtract command-language.  (The string input is
--   tokenised internally by the lexer 'lexXtract'.)
--   See <http://www.haskell.org/HaXml/Xtract.html> for the grammar that
--   is accepted.

--   Because the original Xtract grammar was left-recursive, we have
--   transformed it into a non-left-recursive form.
module Text.XML.HaXml.Xtract.Parse (parseXtract,xtract) where

import Text.ParserCombinators.Poly hiding (bracket)
import Text.XML.HaXml.Xtract.Lex
import Text.XML.HaXml.Xtract.Combinators as D
import Text.XML.HaXml.Combinators as C
import Text.XML.HaXml.Types (Content)
import Data.List(isPrefixOf)
import Text.XML.HaXml.Escape (xmlUnEscapeContent,stdXmlEscaper)

-- output transformer - to ensure that text/references are glued together
unescape :: [Content i] -> [Content i]
unescape = xmlUnEscapeContent stdXmlEscaper


-- | To convert an Xtract query into an ordinary HaXml combinator expression.
--   First arg is a tag-transformation function (e.g. map toLower) applied
---  before matching.  Second arg is the query string.
xtract :: (String->String) -> String -> CFilter i
xtract f query
    | interiorRef lexedQ = dfilter (parseXtract lexedQ)
    | otherwise          = cfilter (parseXtract lexedQ)
  where
    lexedQ = lexXtract f query
    -- test whether query has interior reference to doc root
    interiorRef (Right (_,Symbol s): Right (_,Symbol "//"): _)
                                          | s `elem` predicateIntro = True
    interiorRef (Right (_,Symbol s): Right (_,Symbol "/"): _)
                                          | s `elem` predicateIntro = True
    interiorRef (_ : rest) = interiorRef rest
    interiorRef [] = False
    predicateIntro = [ "[", "("
                     ,  "&",   "|",  "~"
                     ,  "=",  "!=",  "<",  "<=",  ">",  ">="
                     , ".=.",".!=.",".<.",".<=.",".>.",".>=." ]

-- | The cool thing is that the Xtract command parser directly builds
--   a higher-order 'DFilter' (see "Text.XML.HaXml.Xtract.Combinators")
--   which can be applied to an XML document without further ado.
--   (@parseXtract@ halts the program if a parse error is found.)
parseXtract :: [Token] -> DFilter i
parseXtract = either error id . parseXtract'

-- | @parseXtract'@ returns error messages through the Either type.
parseXtract' :: [Token] -> Either String (DFilter i)
parseXtract' = fst . runParser (aquery liftLocal)

---- Auxiliary Parsing Functions ----
type XParser a = Parser (Either String (Posn,TokenT)) a

string :: XParser String
string = P (\inp -> case inp of
                (Left err: _) -> Failure inp err
                (Right (_,TokString n):ts) -> Success ts n
                ts -> Failure ts "expected a string" )
number :: XParser Integer
number = P (\inp -> case inp of
                (Left err: _) -> Failure inp err
                (Right (_,TokNum n):ts) -> Success ts n
                ts -> Failure ts "expected a number" )
symbol :: String -> XParser ()
symbol s = P (\inp -> case inp of
                (Left err: _) -> Failure inp err
                (Right (_, Symbol n):ts) | n==s -> Success ts ()
                ts -> Failure ts ("expected symbol "++s) )

quote :: XParser ()
quote = oneOf [ symbol "'",  symbol "\"" ]

pam :: [a->b] -> a -> [b]
pam fs x = [ f x | f <- fs ]


{--- original Xtract grammar ----
      query     = string			tagname
                | string *			tagname prefix
                | * string			tagname suffix
                | *				any element
                | -				chardata
                | ( query )
                | query / query			parent/child relationship
                | query // query		deep inside
                | query + query			union of queries
                | query [predicate]
                | query [positions]

      predicate = quattr			has attribute
                | quattr op ' string '		attribute has value
                | quattr op " string "		attribute has value
                | quattr op  quattr		attribute value comparison (lexical)
                | quattr nop integer  		attribute has value (numerical)
                | quattr nop quattr		attribute value comparison (numerical)
                | ( predicate )			bracketting
                | predicate & predicate		logical and
                | predicate | predicate		logical or
                | ~ predicate			logical not

      attribute = @ string			has attribute
                | query / @ string		child has attribute
                | -				has textual content
                | query / -			child has textual content

      quattr    = query
                | attribute

      op        =  =				equal to
                |  !=				not equal to
                |  <				less than
                |  <=				less than or equal to
                |  >				greater than
                |  >=				greater than or equal to

      nop       =  .=.				equal to
                |  .!=.				not equal to
                |  .<.				less than
                |  .<=.				less than or equal to
                |  .>.				greater than
                |  .>=.				greater than or equal to

      positions = position {, positions}	multiple positions
                | position - position		ranges

      position  = integer			numbering is from 0 upwards
                | $				last


---- transformed grammar (removing left recursion)
      aquery = ./ tquery	-- current context
             | tquery		-- also current context
             | / tquery		-- root context
             | // tquery	-- deep context from root

      tquery = ( tquery ) xquery
             | tag xquery
             | -		-- fixes original grammar ("-/*" is incorrect)
      
      tag    = string *
             | string
             | * string
             | *
      
      xquery = / tquery
             | // tquery
             | / @ string	-- new: print attribute value
             | + tquery
             | [ tpredicate ] xquery
             | [ positions ] xquery
             | lambda

      tpredicate = vpredicate upredicate
      upredicate = & tpredicate
                 | | tpredicate
                 | lambda
      vpredicate = ( tpredicate )
                 | ~ tpredicate
                 | tattribute

      tattribute = aquery uattribute
                 | @ string vattribute
      uattribute = / @ string vattribute
                 | vattribute
      vattribute = op wattribute
                 | op ' string '
                 | nop wattribute
                 | nop integer
                 | lambda
      wattribute = @ string
                 | aquery / @ string
                 | aquery

      positions  = simplepos commapos
      simplepos  = integer range
                 | $
      range      = - integer
                 | - $
                 | lambda
      commapos   = , simplepos commapos
                 | lambda

      op         =  =
                 |  !=
                 |  <
                 |  <=
                 |  >
                 |  >=

      nop        =  .=.
                 |  .!=.
                 |  .<.
                 |  .<=.
                 |  .>.
                 |  .>=.
-}

bracket :: XParser a -> XParser a
bracket p =
  do symbol "("
     x <- p
     symbol ")"
     return x


---- Xtract parsers ----

-- aquery chooses to search from the root, or only in local context
aquery ::  ((CFilter i->CFilter i) -> (DFilter i->DFilter i))
           -> XParser (DFilter i)
aquery lift = oneOf
    [ do symbol "//"
         tquery [lift C.multi]
    , do symbol "/"
         tquery [lift id]
    , do symbol "./"
         tquery [(local C.keep D./>)]
    , do tquery [(local C.keep D./>)]
    ]

tquery :: [DFilter i->DFilter i] -> XParser (DFilter i)
tquery [] = tquery [id]
tquery (qf:cxt) = oneOf
    [ do q <- bracket (tquery (qf:qf:cxt))
         xquery cxt q
    , do q <- xtag
         xquery cxt (qf ((unescape .).q))       -- glue inners texts together
    , do symbol "-"
         return (qf (local C.txt))
    ]

xtag :: XParser (DFilter i)
xtag = oneOf
    [ do s <- string
         symbol "*"
         return (local (C.tagWith (s `isPrefixOf`)))
    , do s <- string
         return (local (C.tag s))
    , do symbol "*"
         s <- string
         return (local (C.tagWith (((reverse s) `isPrefixOf`) . reverse)))
    , do symbol "*"
         return (local C.elm)
    ]


xquery :: [DFilter i->DFilter i] -> DFilter i -> XParser (DFilter i)
xquery cxt q1 = oneOf
    [ do symbol "/"
         ( do symbol "@"
              attr <- string
              return (D.iffind attr (\s->local (C.literal s)) D.none `D.o` q1)
           `onFail`
           tquery ((q1 D./>):cxt) )
    , do symbol "//"
         tquery ((\q2-> (liftLocal C.multi) q2
                            `D.o` local C.children `D.o` q1):cxt)
    , do symbol "+"
         q2 <- tquery cxt
         return (D.cat [q1,q2])
    , do symbol "["
         is <- iindex   -- now extended to multiple indexes
         symbol "]"
         xquery cxt (\xml-> concat . pam is . q1 xml)
    , do symbol "["
         p <- tpredicate
         symbol "]"
         xquery cxt (q1 `D.with` p)
    , return q1
    ]

tpredicate :: XParser (DFilter i)
tpredicate =
  do p <- vpredicate
     f <- upredicate
     return (f p)

upredicate :: XParser (DFilter i->DFilter i)
upredicate = oneOf
    [ do symbol "&"
         p2 <- tpredicate
         return (`D.o` p2)
    , do symbol "|"
         p2 <- tpredicate
         return (D.|>| p2)
    , return id
    ]

vpredicate :: XParser (DFilter i)
vpredicate = oneOf
    [ do bracket tpredicate
    , do symbol "~"
         p <- tpredicate
         return (local C.keep `D.without` p)
    , do tattribute
    ]

tattribute :: XParser (DFilter i)
tattribute = oneOf
    [ do q <- aquery liftGlobal
         uattribute q
    , do symbol "@"
         s <- string
         vattribute (local C.keep, local (C.attr s), D.iffind s)
    ]

uattribute :: DFilter i -> XParser (DFilter i)
uattribute q = oneOf
    [ do symbol "/"
         symbol "@"
         s <- string
         vattribute (q, local (C.attr s), D.iffind s)
    , do vattribute (q, local C.keep,     D.ifTxt)
    ]

vattribute :: (DFilter i, DFilter i, (String->DFilter i)->DFilter i->DFilter i)
              -> XParser (DFilter i)
vattribute (q,a,iffn) = oneOf
  [ do cmp <- op
       quote
       s2 <- string
       quote
       return ((iffn (\s1->if cmp s1 s2 then D.keep else D.none) D.none)
               `D.o` q)
  , do cmp <- op
       (q2,iffn2) <- wattribute -- q2 unused?  is this a mistake?
       return ((iffn (\s1-> iffn2 (\s2-> if cmp s1 s2 then D.keep else D.none)
                                  D.none)
                     D.none) `D.o` q)
  , do cmp <- nop
       n <- number
       return ((iffn (\s->if cmp (read s) n then D.keep else D.none) D.none)
               `D.o` q)
  , do cmp <- nop
       (q2,iffn2) <- wattribute -- q2 unused?  is this a mistake?
       return ((iffn (\s1-> iffn2 (\s2-> if cmp (read s1) (read s2) then D.keep
                                                                    else D.none)
                                  D.none)
                     D.none) `D.o` q)
  , do return ((a `D.o` q))
  ]

wattribute :: XParser (DFilter i, (String->DFilter i)->DFilter i->DFilter i)
wattribute = oneOf
    [ do symbol "@"
         s <- string
         return (D.keep, D.iffind s)
    , do q <- aquery liftGlobal
         symbol "/"
         symbol "@"
         s <- string
         return (q, D.iffind s)
    , do q <- aquery liftGlobal
         return (q, D.ifTxt)
    ]


iindex :: XParser [[a]->[a]]
iindex =
    do i <- simpleindex
       is <- idxcomma
       return (i:is)

simpleindex :: XParser ([a]->[a])
simpleindex = oneOf
    [ do n <- number
         r <- rrange n
         return r
    , do symbol "$"
         return (C.keep . last)
    ]

rrange, numberdollar :: Integer -> XParser ([a]->[a])
rrange n1 = oneOf
    [ do symbol "-"
         numberdollar n1
    , return (take 1 . drop (fromInteger n1))
    ]

numberdollar n1 = oneOf
    [ do n2 <- number
         return (take (fromInteger (1+n2-n1)) . drop (fromInteger n1))
    , do symbol "$"
         return (drop (fromInteger n1))
    ]

idxcomma :: XParser [[a]->[a]]
idxcomma = oneOf
    [ do symbol ","
         r <- simpleindex
         rs <- idxcomma
         return (r:rs)
    , return []
    ]


op :: XParser (String->String->Bool)
op = oneOf
    [ do symbol "=";  return (==)
    , do symbol "!="; return (/=)
    , do symbol "<";  return (<)
    , do symbol "<="; return (<=)
    , do symbol ">";  return (>)
    , do symbol ">="; return (>=)
    ]

nop :: XParser (Integer->Integer->Bool)
nop = oneOf
    [ do symbol ".=.";  return (==)
    , do symbol ".!=."; return (/=)
    , do symbol ".<.";  return (<)
    , do symbol ".<=."; return (<=)
    , do symbol ".>.";  return (>)
    , do symbol ".>=."; return (>=)
    ]