-- ------------------------------------------------------------

{- |
   Module     : Text.XML.HXT.Parser.XmlTokenParser
   Copyright  : Copyright (C) 2010 Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
   Stability  : stable
   Portability: portable

   Parsec parser for XML tokens

-}

-- ------------------------------------------------------------

module Text.XML.HXT.Parser.XmlTokenParser
    ( allBut
    , allBut1
    , amp
    , asciiLetter
    , attrChar
    , attrValue
    , bar
    , charRef
    , checkString
    , comma
    , dq
    , encName
    , entityRef
    , entityValue
    , eq
    , gt
    , keyword
    , keywords
    , lpar
    , lt
    , name
    , names
    , ncName
    , nmtoken
    , nmtokens
    , peReference
    , pubidLiteral
    , qName
    , quoted
    , reference
    , rpar
    , semi
    , separator
    , singleChar
    , singleChars
    , skipS
    , skipS0
    , sPace
    , sPace0
    , sq
    , systemLiteral
    , versionNum

    , concRes
    , mkList

    , nameT
    , nmtokenT

    , entityValueT
    , entityTokensT
    , entityCharT

    , attrValueT
    , attrValueT'

    , referenceT
    , charRefT
    , entityRefT
    , peReferenceT

    , singleCharsT

    , mergeTextNodes
    )
where

import Control.Applicative                      ( (<$>) )

import Data.Char.Properties.XMLCharProps        ( isXmlChar
                                                , isXmlCharCR
                                                )
import Data.String.Unicode                      ( intToCharRef
                                                , intToCharRefHex
                                                )

import Text.ParserCombinators.Parsec

import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.DOM.XmlNode                 ( mkDTDElem'
                                                , mkText'
                                                , mkCharRef'
                                                , mkEntityRef'
                                                , mergeText
                                                )
import Text.XML.HXT.Parser.XmlCharParser        ( xmlNameChar
                                                , xmlNameStartChar
                                                , xmlNCNameChar
                                                , xmlNCNameStartChar
                                                , xmlSpaceChar
                                                , xmlCRLFChar
                                                , XParser
                                                )

-- ------------------------------------------------------------
--
-- Character (2.2) and White Space (2.3)
--
-- Unicode parsers in module XmlCharParser

-- ------------------------------------------------------------

sPace           :: XParser s String
sPace
    = many1 xmlSpaceChar

sPace0          :: XParser s String
sPace0
    = many xmlSpaceChar

skipS           :: XParser s ()
skipS
    = skipMany1 xmlSpaceChar

skipS0          :: XParser s ()
skipS0
    = skipMany xmlSpaceChar

-- ------------------------------------------------------------
--
-- Names and Tokens (2.3)

asciiLetter             :: XParser s Char
asciiLetter
    = satisfy (\ c -> ( c >= 'A' && c <= 'Z' ||
                        c >= 'a' && c <= 'z' )
              )
      <?> "ASCII letter"

name            :: XParser s String
name
    = do
      s1 <- xmlNameStartChar
      sl <- many xmlNameChar
      return (s1 : sl)
      <?> "Name"

-- Namespaces in XML: Rules [4-5] NCName:

ncName          :: XParser s String
ncName
    = do
      s1 <- xmlNCNameStartChar
      sl <- many xmlNCNameChar
      return (s1 : sl)
      <?> "NCName"

-- Namespaces in XML: Rules [6-8] QName:

qName           :: XParser s (String, String)
qName
    = do
      s1 <- ncName
      s2 <- option "" (char ':' >> ncName)
      return ( if null s2
               then (s2, s1)
               else (s1, s2)
             )

nmtoken         :: XParser s String
nmtoken
    = try (many1 xmlNameChar)
      <?> "Nmtoken"

names           :: XParser s [String]
names
    = sepBy1 name sPace

nmtokens        :: XParser s [String]
nmtokens
    = sepBy1 nmtoken sPace

-- ------------------------------------------------------------
--
-- Literals (2.3)

singleChar              :: String -> XParser s Char
singleChar notAllowed
    = satisfy (\ c -> isXmlCharCR c && c `notElem` notAllowed)
      <|>
      xmlCRLFChar

singleChars             :: String -> XParser s String
singleChars notAllowed
    = many1 (singleChar notAllowed)

entityValue     :: XParser s String
entityValue
    = ( do
        v <- entityValueDQ
        return ("\"" ++ v ++ "\"")
      )
      <|>
      ( do
        v <- entityValueSQ
        return ("'" ++ v ++ "'")
      )
      <?> "entity value (in quotes)"

entityValueDQ   :: XParser s String
entityValueDQ
    = between dq dq (concRes $ many $ attrChar "&\"")

entityValueSQ   :: XParser s String
entityValueSQ
    = between sq sq (concRes $ many $ attrChar "&\'")

attrValue       :: XParser s String
attrValue
    = ( do
        v <- attrValueDQ
        return ("\"" ++ v ++ "\"")
      )
      <|>
      ( do
        v <- attrValueSQ
        return ("'" ++ v ++ "'")
      )
      <?> "attribute value (in quotes)"

attrValueDQ     :: XParser s String
attrValueDQ
    = between dq dq (concRes $ many $ attrChar "<&\"")

attrValueSQ     :: XParser s String
attrValueSQ
    = between sq sq (concRes $ many $ attrChar "<&\'")

attrChar        :: String -> XParser s String
attrChar notAllowed
    = reference
      <|>
      mkList (singleChar notAllowed)
      <?> ("legal attribute or entity character or reference (not allowed: " ++ show notAllowed ++ " )")

systemLiteral           :: XParser s String
systemLiteral
    = between dq dq (many $ noneOf "\"")
      <|>
      between sq sq (many $ noneOf "\'")
      <?> "system literal (in quotes)"

pubidLiteral            :: XParser s String
pubidLiteral
    = between dq dq (many $ pubidChar "\'")
      <|>
      between sq sq (many $ pubidChar "")
      <?> "pubid literal (in quotes)"
      where
      pubidChar         :: String -> XParser s Char
      pubidChar quoteChars
          = asciiLetter
            <|>
            digit
            <|>
            oneOf " \r\n"               -- no "\t" allowed, so xmlSpaceChar parser not used
            <|>
            oneOf "-()+,./:=?;!*#@$_%"
            <|>
            oneOf quoteChars

-- ------------------------------------------------------------
--
-- Character and Entity References (4.1)

reference       :: XParser s String
reference
    = ( do
        i <- charRef
        return ("&#" ++ show i ++ ";")
      )
      <|>
      ( do
        n <- entityRef
        return ("&" ++ n ++ ";")
      )

checkCharRef    :: Int -> XParser s Int
checkCharRef i
    = if ( i <= fromEnum (maxBound::Char)
           && isXmlChar (toEnum i)
         )
        then return i
        else unexpected ("illegal value in character reference: " ++ intToCharRef i ++ " , in hex: " ++ intToCharRefHex i)

charRef         :: XParser s Int
charRef
    = do
      checkString "&#x"
      d <- many1 hexDigit
      semi
      checkCharRef (hexStringToInt d)
      <|>
      do
      checkString "&#"
      d <- many1 digit
      semi
      checkCharRef (decimalStringToInt d)
      <?> "character reference"

entityRef       :: XParser s String
entityRef
    = do
      amp
      n <- name
      semi
      return n
      <?> "entity reference"

peReference     :: XParser s String
peReference
    = try ( do
            _ <- char '%'
            n <- name
            semi
            return n
          )
      <?> "parameter-entity reference"

-- ------------------------------------------------------------
--
-- 4.3

encName         :: XParser s String
encName
    = do
      c <- asciiLetter
      r <- many (asciiLetter <|> digit <|> oneOf "._-")
      return (c:r)

versionNum      :: XParser s String
versionNum
    = many1 xmlNameChar


-- ------------------------------------------------------------
--
-- keywords

keyword         :: String -> XParser s String
keyword kw
    = try ( do
            n <- name
            if n == kw
              then return n
              else unexpected n
          )
      <?> kw

keywords        :: [String] -> XParser s String
keywords
    = foldr1 (<|>) . map keyword

-- ------------------------------------------------------------
--
-- parser for quoted attribute values

quoted          :: XParser s a -> XParser s a
quoted p
    = between dq dq p
      <|>
      between sq sq p

-- ------------------------------------------------------------
--
-- simple char parsers

dq, sq, lt, gt, semi, amp    :: XParser s ()

dq      = char '\"' >> return ()
sq      = char '\'' >> return ()
lt      = char '<'  >> return ()
gt      = char '>'  >> return ()
semi    = char ';'  >> return ()
amp     = char '&'  >> return ()

{-# INLINE  dq #-}
{-# INLINE  sq #-}
{-# INLINE  lt #-}
{-# INLINE  gt #-}
{-# INLINE  semi #-}
{-# INLINE  amp #-}

separator       :: Char -> XParser s ()
separator c
    = do
      _ <- try ( do
                 skipS0
                 char c
               )
      skipS0
      <?> [c]

bar, comma, eq, lpar, rpar      :: XParser s ()

bar     = separator '|'
comma   = separator ','
eq      = separator '='

{-# INLINE bar #-}
{-# INLINE comma #-}
{-# INLINE eq #-}

lpar    = char '(' >> skipS0
{-# INLINE lpar #-}

rpar    = skipS0 >> char ')' >> return ()
{-# INLINE rpar #-}

checkString     :: String -> XParser s ()
checkString s
    = try $ string s >> return ()
{-# INLINE checkString #-}

-- ------------------------------------------------------------
--
-- all chars but not a special substring

allBut          :: (XParser s Char -> XParser s String) -> String -> XParser s String
allBut p str
    = allBut1 p (const True) str

allBut1         :: (XParser s Char -> XParser s String) -> (Char -> Bool) -> String -> XParser s String
allBut1 p prd (c:rest)
    = p ( satisfy (\ x -> isXmlCharCR x && prd x && not (x == c) )
          <|>
          xmlCRLFChar
          <|>
          try ( char c
                >>
                notFollowedBy (try (string rest) >> return c)
                >>
                return c
              )
        )

allBut1 _p _prd str
    = error ("allBut1 _ _ " ++ show str ++ " is undefined")

-- ------------------------------------------------------------
--
-- concatenate parse results

concRes         :: XParser s [[a]] -> XParser s [a]
concRes p
    = do
      sl <- p
      return (concat sl)

mkList          :: XParser s a -> XParser s [a]
mkList p
    = do
      r <- p
      return [r]

-- ------------------------------------------------------------
--
-- token parsers returning XmlTrees
--
-- ------------------------------------------------------------
--
-- Literals (2.3)

nameT           :: XParser s XmlTree
nameT
    = do
      n <- name
      return (mkDTDElem' NAME [(a_name, n)] [])

nmtokenT        :: XParser s XmlTree
nmtokenT
    = do
      n <- nmtoken
      return (mkDTDElem' NAME [(a_name, n)] [])


entityValueT    :: XParser s XmlTrees
entityValueT
    =  do
       sl <- between dq dq (entityTokensT "%&\"")
       return sl
       <|>
       do
       sl <- between sq sq (entityTokensT "%&\'")
       return sl
       <?> "entity value (in quotes)"

entityTokensT   :: String -> XParser s XmlTrees
entityTokensT notAllowed
    = many (entityCharT notAllowed)

entityCharT     :: String -> XParser s XmlTree
entityCharT notAllowed
    = peReferenceT
      <|>
      charRefT
      <|>
      bypassedEntityRefT
      <|>
      ( do
        cs <- many1 (singleChar notAllowed)
        return (mkText' cs)
      )

attrValueT      :: XParser s XmlTrees
attrValueT
    = between dq dq (attrValueT' "<&\"")
      <|>
      between sq sq (attrValueT' "<&\'")
      <?> "attribute value (in quotes)"

attrValueT'     :: String -> XParser s XmlTrees
attrValueT' notAllowed
    = mergeTextNodes <$> many ( referenceT <|> singleCharsT notAllowed)

singleCharsT    :: String -> XParser s XmlTree
singleCharsT notAllowed
    = do
      cs <- singleChars notAllowed
      return (mkText' cs)

-- ------------------------------------------------------------
--
-- Character and Entity References (4.1)

referenceT      :: XParser s XmlTree
referenceT
    = charRefT
      <|>
      entityRefT

charRefT        :: XParser s XmlTree
charRefT
    = do
      i <- charRef
      return (mkCharRef' i)

entityRefT      :: XParser s XmlTree
entityRefT
    = do
      n <- entityRef
      return $! (maybe (mkEntityRef' n) mkCharRef' . lookup n $ predefinedXmlEntities)

-- optimization: predefined XML entity refs are converted into equivalent char refs
-- so there is no need for an entitiy substitution phase, if there is no DTD
-- Attention: entityRefT must only be called from within XML/HTML content
-- in DTD parsing this optimization is not allowed because of different semantics
-- of charRefs and entityRefs during substitution of entites in ENTITY definitions

predefinedXmlEntities   :: [(String, Int)]
predefinedXmlEntities
    = [ ("lt",   60)
      , ("gt",   62)
      , ("amp",  38)
      , ("apos", 39)
      , ("quot", 34)
      ]

bypassedEntityRefT      :: XParser s XmlTree
bypassedEntityRefT
    = do
      n <- entityRef
      return $! (mkText' ("&" ++ n ++ ";"))

peReferenceT    :: XParser s XmlTree
peReferenceT
    = do
      r <- peReference
      return $! (mkDTDElem' PEREF [(a_peref, r)] [])

-- ------------------------------------------------------------

mergeTextNodes :: XmlTrees -> XmlTrees
mergeTextNodes
    = foldr addText []
    where 
      addText :: XmlTree -> XmlTrees -> XmlTrees
      addText t []
          = [t]
      addText t (t1 : ts)
          = mergeText t t1 ++ ts

-- ------------------------------------------------------------