{- |
   This module defines an internal (generic) representation for XML
   documents including their DTDs.

   History:
   The original module was derived by hand from the XML specification,
   following the grammar precisely.  Then we simplified the types,
   removing layers of indirection and redundancy, and generally making
   things easier to work with.  Then we allowed PEReferences to be
   ubiquitous, by removing them from the types and resolving all
   PE references at parse-time.  Finally, we added a per-document
   symbol table for GEReferences, and a whitespace-significance flag
   for plaintext.
-}

module Text.XML.HaXml.Types
  (
  -- * A simple symbol table mapping strings (references) to values.
    SymTab
  -- ** Symbol table operations
  , emptyST
  , addST
  , lookupST

  -- * XML Types
  -- ** The top-level document container
  , Document(..)

  -- ** The main document content
  , Element(..)
  , ElemTag(..)
  , Content(..)
  , Attribute
  , AttValue(..)
  , info

  -- ** Administrative parts of the document
  , Prolog(..)
  , XMLDecl(..)
  , Misc(..)
  , ProcessingInstruction
  , SDDecl
  , VersionInfo
  , Comment
  , PITarget

  -- ** The DTD
  -- *** content model
  , DocTypeDecl(..)
  , MarkupDecl(..)
  , ExtSubset(..)
  , ExtSubsetDecl(..)
  , ElementDecl(..)
  , ContentSpec(..)
  , CP(..)
  , Modifier(..)
  , Mixed(..)

  -- *** attribute model
  , AttListDecl(..)
  , AttDef(..)
  , AttType(..)
  , TokenizedType(..)
  , EnumeratedType(..)
  , NotationType
  , Enumeration
  , DefaultDecl(..)
  , FIXED(..)

  -- *** conditional sections
  , ConditionalSect(..)
  , IncludeSect
  , IgnoreSect
  , Ignore(..)
  , IgnoreSectContents(..)

  -- ** References
  , Reference(..)
  , EntityRef
  , CharRef
  , PEReference

  -- ** Entities
  , EntityDecl(..)
  , GEDecl(..)
  , PEDecl(..)
  , EntityDef(..)
  , PEDef(..)
  , ExternalID(..)
  , NDataDecl(..)
  , TextDecl(..)
  , ExtParsedEnt(..)
  , ExtPE(..)
  , NotationDecl(..)
  , PublicID(..)
  , EncodingDecl(..)
  , EntityValue(..)
  , EV(..)
  , PubidLiteral(..)
  , SystemLiteral(..)

  -- ** Namespaces
  , QName(..)
  , Namespace(..)

  -- ** Basic value types
  , Name
  , Names
  , NmToken
  , NmTokens
  , CharData
  , CDSect
  ) where


{- A simple symbol table for storing macros whilst parsing. -}

type SymTab a = [(String,a)]

emptyST :: SymTab a
emptyST  = []

addST :: String -> a -> SymTab a -> SymTab a
addST n v = ((n,v):)

lookupST :: String -> SymTab a -> Maybe a
lookupST = lookup



{- XML types start here -}

-- | The symbol table stored in a document holds all its general entity
--   reference definitions.
data Document i = Document Prolog (SymTab EntityDef) (Element i) [Misc]
                  deriving (Eq, Show)
data Prolog     = Prolog (Maybe XMLDecl) [Misc] (Maybe DocTypeDecl) [Misc]
                  deriving (Eq, Show)
data XMLDecl    = XMLDecl VersionInfo (Maybe EncodingDecl) (Maybe SDDecl)
                  deriving (Eq, Show)
data Misc       = Comment Comment
                | PI ProcessingInstruction
                deriving (Eq, Show)

type ProcessingInstruction = (PITarget,String)

type SDDecl      = Bool
type VersionInfo = String
type Comment     = String
type PITarget    = String

data DocTypeDecl = DTD QName (Maybe ExternalID) [MarkupDecl]  deriving (Eq, Show)
data MarkupDecl  = Element  ElementDecl
                 | AttList  AttListDecl
                 | Entity   EntityDecl
                 | Notation NotationDecl
                 | MarkupMisc Misc
                 deriving (Eq, Show)

data ExtSubset     = ExtSubset (Maybe TextDecl) [ExtSubsetDecl]  deriving (Eq, Show)
data ExtSubsetDecl = ExtMarkupDecl MarkupDecl
                   | ExtConditionalSect ConditionalSect
                   deriving (Eq, Show)

data Element i = Elem QName [Attribute] [Content i] deriving (Eq, Show)
--  ElemTag is an intermediate type for parsing only
data ElemTag   = ElemTag QName [Attribute]
type Attribute = (QName, AttValue)
data Content i = CElem (Element i) i
               | CString Bool CharData i
                        -- ^ bool is whether whitespace is significant
               | CRef Reference i
               | CMisc Misc i
               deriving Show

-- custom instance of Eq, ignoring the informational elements.
instance Eq (Content i) where
    (CElem e _)     == (CElem e' _)       =  e==e'
    (CString b c _) == (CString b' c' _)  =  b==b' && c==c'
    (CRef r _)      == (CRef r' _)        =  r==r'
    (CMisc m _)     == (CMisc m' _)       =  m==m'

info :: Content t -> t
info (CElem _ i) = i
info (CString _ _ i) = i
info (CRef _ i) = i
info (CMisc _ i) = i

instance Functor Document where
  fmap f (Document p st e ms) = Document p st (fmap f e) ms
instance Functor Element where
  fmap f (Elem t as cs) = Elem t as (map (fmap f) cs)
instance Functor Content where
  fmap f (CElem e i)     = CElem (fmap f e) (f i)
  fmap f (CString b s i) = CString b s (f i)
  fmap f (CRef r i)      = CRef r (f i)
  fmap f (CMisc m i)     = CMisc m (f i)

data ElementDecl = ElementDecl QName ContentSpec deriving (Eq, Show)
data ContentSpec = EMPTY
                 | ANY
                 | Mixed Mixed
                 | ContentSpec CP
                 deriving (Eq, Show)
data CP = TagName QName Modifier
        | Choice [CP] Modifier
        | Seq [CP] Modifier
        deriving (Eq, Show)
data Modifier = None  -- ^ Just One
              | Query -- ^ Zero Or One
              | Star  -- ^ Zero Or More
              | Plus  -- ^ One Or More
              deriving (Eq, Show)
data Mixed = PCDATA
           | PCDATAplus [QName]
           deriving (Eq, Show)
data AttListDecl = AttListDecl QName [AttDef] deriving (Eq, Show)
data AttDef      = AttDef QName AttType DefaultDecl deriving (Eq, Show)
data AttType     = StringType
                 | TokenizedType TokenizedType
                 | EnumeratedType EnumeratedType
                 deriving (Eq, Show)
data TokenizedType = ID
                   | IDREF
                   | IDREFS
                   | ENTITY
                   | ENTITIES
                   | NMTOKEN
                   | NMTOKENS
                   deriving (Eq, Show)
data EnumeratedType = NotationType NotationType
                    | Enumeration Enumeration
                    deriving (Eq, Show)
type NotationType   = [Name]    -- nonempty list
type Enumeration    = [NmToken] -- nonempty list
data DefaultDecl    = REQUIRED
                    | IMPLIED
                    | DefaultTo AttValue (Maybe FIXED)
                    deriving (Eq, Show)
data FIXED          = FIXED deriving (Eq, Show)

data ConditionalSect = IncludeSect IncludeSect
                     | IgnoreSect IgnoreSect
                     deriving (Eq, Show)
type IncludeSect = [ExtSubsetDecl]
type IgnoreSect  = [IgnoreSectContents]
data Ignore      = Ignore deriving (Eq, Show)
data IgnoreSectContents = IgnoreSectContents Ignore [(IgnoreSectContents,Ignore)]  deriving (Eq, Show)

data Reference    = RefEntity EntityRef
                  | RefChar CharRef
                  deriving (Eq,Show)
type EntityRef    = Name
type CharRef      = Int
type PEReference  = Name

data EntityDecl   = EntityGEDecl GEDecl
                  | EntityPEDecl PEDecl
                  deriving (Eq, Show)
data GEDecl       = GEDecl Name EntityDef deriving (Eq, Show)
data PEDecl       = PEDecl Name PEDef deriving (Eq, Show)
data EntityDef    = DefEntityValue EntityValue
                  | DefExternalID ExternalID (Maybe NDataDecl)
                  deriving (Eq, Show)
data PEDef        = PEDefEntityValue EntityValue
                  | PEDefExternalID ExternalID deriving (Eq,Show)
data ExternalID   = SYSTEM SystemLiteral
                  | PUBLIC PubidLiteral SystemLiteral deriving (Eq,Show)
newtype NDataDecl = NDATA Name  deriving (Eq, Show)

data TextDecl       = TextDecl (Maybe VersionInfo) EncodingDecl  deriving (Eq, Show)
data ExtParsedEnt i = ExtParsedEnt (Maybe TextDecl) (Content i) deriving (Eq, Show)
data ExtPE          = ExtPE (Maybe TextDecl) [ExtSubsetDecl] deriving (Eq, Show)

data NotationDecl    = NOTATION Name (Either ExternalID PublicID) deriving (Eq, Show)
newtype PublicID     = PUBLICID PubidLiteral deriving (Eq, Show)
newtype EncodingDecl = EncodingDecl String deriving (Eq, Show)

-- | A QName is a (possibly) qualified name, in the sense of XML namespaces.
data QName    = N  Name
              | QN Namespace Name deriving (Eq,Show)
-- | Namespaces are not defined in the XML spec itself, but at
--       http://www.w3.org/TR/xml-names
data Namespace = Namespace { nsPrefix  :: String
                           , nsURI     :: String
                           }
                 deriving (Show)
instance Eq Namespace where
    p == q  =  nsURI p == nsURI q     -- this is the W3C spec's definition!
instance Ord QName where
    compare (N n)    (N m)    = compare n m
    compare (QN p n) (N m)    = LT
    compare (N n)    (QN q m) = GT
    compare (QN p n) (QN q m) = case compare (nsPrefix p) (nsPrefix q) of
                                  EQ -> compare n m
                                  r  -> r

type Name     = String           -- non-empty string
type Names    = [Name]           -- non-empty list
type NmToken  = String           -- non-empty string
type NmTokens = [NmToken]        -- non-empty list

data AttValue    = AttValue [Either String Reference] deriving Eq
instance Show AttValue where
  show (AttValue v) = concatMap decode v
    where
      decode (Left  w)               = w
      decode (Right (RefEntity ent)) = "&"++ent++";"
      decode (Right (RefChar cref))  = "&"++show cref++";"

data EntityValue = EntityValue [EV] deriving (Eq,Show)
data EV = EVString String
 --  -- | EVPERef PEReference
        | EVRef Reference  deriving (Eq,Show)
newtype PubidLiteral  = PubidLiteral String deriving (Eq,Show)
newtype SystemLiteral = SystemLiteral String deriving (Eq,Show)
type CharData         = String
type CDSect           = CharData

instance Eq ElemTag where
    (ElemTag n _) == (ElemTag m _)  = n==m