{-# LANGUAGE FlexibleContexts #-}
module Ideas.Text.XML.Parser (document, extParsedEnt, extSubset) where
import Control.Monad
import Data.Char (toUpper, ord, isSpace)
import Data.List (foldl')
import Data.Maybe (catMaybes)
import Ideas.Text.XML.Document hiding (versionInfo, name, content)
import Ideas.Text.XML.Unicode
import Ideas.Utils.Parsing hiding (digit, letter, space)
import Prelude hiding (seq)
import qualified Ideas.Text.XML.Document as D
letter, digit, combiningChar, extender :: Parser Char
letter = satisfy isLetter
digit = satisfy isDigit
combiningChar = satisfy isCombiningChar
extender = satisfy isExtender
parens, brackets, singleQuoted, doubleQuoted :: Parser a -> Parser a
parens = between (char '(') (char ')')
brackets = between (char '[') (char ']')
singleQuoted = between (char '\'') (char '\'')
doubleQuoted = between (char '"') (char '"')
document :: Parser XMLDoc
document = do
(mxml, mdtd) <- prolog
rt <- element
miscs
let (ver, enc, sa) =
case mxml of
Just (a, b, c) -> (Just a, b, c)
Nothing -> (Nothing, Nothing, Nothing)
return XMLDoc
{ D.versionInfo = ver
, D.encoding = enc
, D.standalone = sa
, D.dtd = mdtd
, D.externals = []
, root = rt
}
space :: Parser ()
space = void (many1 (oneOf "\x20\x9\xA\xD"))
mspace :: Parser ()
mspace = void (many (oneOf "\x20\x9\xA\xD"))
nameChar :: Parser Char
nameChar = letter <|> digit <|> combiningChar <|> extender <|> oneOf ".-_:"
name :: Parser String
name = do
c <- letter <|> oneOf "_:"
cs <- many nameChar
return (c:cs)
spacedName :: Parser String
spacedName = space *> name <* space
nmtoken :: Parser String
nmtoken = many1 nameChar
entityValue :: Parser EntityValue
entityValue = doubleQuoted (p "%&\"") <|> singleQuoted (p "%&'")
where
p s = many (fmap Left (noneOf s)
<|> fmap Right (fmap Left peReference <|> fmap Right reference))
attValue :: Parser AttValue
attValue = doubleQuoted (p "<&\"") <|> singleQuoted (p "<&'")
where p s = many (fmap Left (noneOf s) <|> fmap Right reference)
systemLiteral :: Parser String
systemLiteral = doubleQuoted (p "\"") <|> singleQuoted (p "'")
where p s = many (noneOf s)
pubidLiteral :: Parser String
pubidLiteral = doubleQuoted (many (pubidChar True)) <|> singleQuoted (many (pubidChar False))
pubidChar :: Bool -> Parser Char
pubidChar withSingleQuote =
ranges xs <|> oneOf "\x20\xD\xA-()+,./:=?;!*#@$_%" <|> singleQuote
where
xs = [('a', 'z'), ('A', 'Z'), ('0', '9')]
singleQuote
| withSingleQuote = char '\'' >> return '\''
| otherwise = fail "pubidChar"
charData :: Parser String
charData = stopOn ["<", "&", "]]>"]
comment :: Parser String
comment = between (string "<!--") (string "-->") (stopOn ["--"])
pInstr :: Parser String
pInstr = between (string "<?") (string "?>") p
where
p = piTarget >> option "" (space >> stopOn ["?>"])
piTarget :: Parser String
piTarget = do
n <- name
when (map toUpper n == "XML") $ fail "XML in piTarget"
return n
cdSect :: Parser XML
cdSect = between (string "<![CDATA[") (string "]]>") p
where
p = do
s <- stopOn ["]]>"]
return (CDATA s)
type XMLDecl = (String, Maybe String, Maybe Bool)
prolog :: Parser (Maybe XMLDecl, Maybe DTD)
prolog = do
ma <- optionMaybe (try xmlDecl)
miscs
mb <- optionMaybe $ try $ do
mb <- doctypedecl
miscs
return mb
return (ma, mb)
xmlDecl :: Parser XMLDecl
xmlDecl = do
skip (string "<?xml")
x <- versionInfo
y <- optionMaybe (try encodingDecl)
z <- optionMaybe (try sdDecl)
mspace
skip (string "?>")
return (x, y, z)
versionInfo :: Parser String
versionInfo = space >> string "version" >> eq >> p
where p = singleQuoted versionNum <|> doubleQuoted versionNum
eq :: Parser ()
eq = skip (mspace >> char '=' >> mspace)
versionNum :: Parser String
versionNum = do
skip (string "1.0")
return "1.0"
misc :: Parser ()
misc = try (skip comment) <|> try (skip pInstr) <|> skip space
miscs :: Parser ()
miscs = skip (many misc)
doctypedecl :: Parser DTD
doctypedecl = do
skip (string "<!DOCTYPE")
space
x <- name
y <- optionMaybe (try (space >> externalID))
mspace
z <- option [] $ do
z <- brackets intSubset
mspace
return z
skip (char '>')
return (DTD x y z)
declSep :: Parser (Maybe DocTypeDecl)
declSep = fmap (Just . DTDParameter) peReference
<|> (space >> return Nothing)
intSubset :: Parser [DocTypeDecl]
intSubset = fmap catMaybes (many (markupdecl <|> declSep))
markupdecl :: Parser (Maybe DocTypeDecl)
markupdecl = fmap Just (choice (map try list))
<|> ((try pInstr <|> comment) >> return Nothing)
where
list = [elementdecl, attlistDecl, entityDecl, notationDecl]
extSubset :: Parser (Maybe TextDecl, [DocTypeDecl])
extSubset = do
m <- optionMaybe textDecl
e <- extSubsetDecl
return (m, e)
extSubsetDecl :: Parser [DocTypeDecl]
extSubsetDecl = fmap catMaybes (many (choice [markupdecl, fmap (Just . DTDConditional) conditionalSect, declSep]))
sdDecl :: Parser Bool
sdDecl = space >> string "standalone" >> eq >> (singleQuoted bool <|> doubleQuoted bool)
where bool = (string "yes" >> return True)
<|> (string "no" >> return False)
element :: Parser Element
element = do
(s1, as, closed) <- sTag
if closed
then return (Element s1 as [])
else do
c <- content
s2 <- eTag
when (s1/=s2) $ fail "WFC: element"
return (Element s1 as c)
sTag :: Parser (Name, Attributes, Bool)
sTag = do
skip (char '<')
n <- name
as <- many (try (space >> attribute))
mspace
b <- (char '>' >> return False) <|>
(string "/>" >> return True)
return (n, as, b)
attribute :: Parser Attribute
attribute = do
n <- name
eq
a <- attValue
return (n := a)
eTag :: Parser Name
eTag = do
skip (string "</")
n <- name
mspace
skip (char '>')
return n
content :: Parser Content
content = chainr1 (fmap g charData) (fmap f ps)
where
f ma l r = l ++ maybe [] return ma ++ r
g s = [ CharData s | any (not . isSpace) s ]
ps = try (fmap Just (choice (map try [fmap Tagged element, fmap Reference reference, cdSect]))
<|> ((try pInstr <|> comment) >> return Nothing))
elementdecl :: Parser DocTypeDecl
elementdecl = do
skip (string "<!ELEMENT")
n <- spacedName
cs <- contentspec
mspace
skip (char '>')
return (ElementDecl n cs)
contentspec :: Parser ContentSpec
contentspec = choice
[ string "EMPTY" >> return Empty
, string "ANY" >> return Any
, try mixed
, children
]
children :: Parser ContentSpec
children = do
a <- try cpChoice <|> cpSeq
f <- option id multi
return (Children (f a))
multi :: Parser (CP -> CP)
multi = (char '?' >> return QuestionMark)
<|> (char '*' >> return Star)
<|> (char '+' >> return Plus)
cp :: Parser CP
cp = do
a <- fmap CPName name <|> try cpChoice <|> cpSeq
f <- option id multi
return (f a)
cpChoice :: Parser CP
cpChoice = parens $ do
mspace
x <- cp
xs <- many1 (try (mspace >> char '|' >> mspace >> cp))
mspace
return (Choice (x:xs))
cpSeq :: Parser CP
cpSeq = parens $ do
mspace
x <- cp
xs <- many (try (mspace >> char ',' >> mspace >> cp))
mspace
return (Sequence (x:xs))
mixed :: Parser ContentSpec
mixed = char '(' >> mspace >> string "#PCDATA" >> (rest1 <|> rest2)
where
p = mspace >> char '|' >> mspace >> name
rest1 = try $ do
xs <- many (try p)
mspace
skip (string ")*")
return (Mixed True xs)
rest2 = mspace >> char ')' >> return (Mixed False [])
attlistDecl :: Parser DocTypeDecl
attlistDecl = do
skip (string "<!ATTLIST")
space
n <- name
ds <- many (try attDef)
mspace
skip (char '>')
return (AttListDecl n ds)
attDef :: Parser AttDef
attDef = do
n <- spacedName
tp <- attType
space
dd <- defaultDecl
return (n, tp, dd)
attType :: Parser AttType
attType = stringType <|> tokenizedType <|> enumeratedType
stringType :: Parser AttType
stringType = string "CDATA" >> return StringType
tokenizedType :: Parser AttType
tokenizedType = choice (map f xs)
where
f (tp, s) = try (string s) >> return tp
xs = [ (IdRefsType, "IDREFS"), (IdRefType, "IDREF"), (IdType, "ID"), (EntityType, "ENTITY")
, (EntitiesType, "ENTITIES"), (NmTokensType, "NMTOKENS"), (NmTokenType, "NMTOKEN")
]
enumeratedType :: Parser AttType
enumeratedType = notationType <|> enumeration
notationType :: Parser AttType
notationType = string "NOTATION" >> space >> parens p
where
p = do
mspace
n <- name
ns <- many (try (mspace >> char '|' >> mspace >> name))
mspace
return (NotationType (n:ns))
enumeration :: Parser AttType
enumeration = parens $ do
mspace
x <- nmtoken
xs <- many (try (mspace >> char '|' >> mspace >> nmtoken))
mspace
return (EnumerationType (x:xs))
defaultDecl :: Parser DefaultDecl
defaultDecl = try (string "#REQUIRED" >> return Required)
<|> try (string "#IMPLIED" >> return Implied)
<|> do f <- option Value (string "#FIXED" >> space >> return Fixed)
a <- attValue
return (f a)
conditionalSect :: Parser Conditional
conditionalSect = try includeSect <|> ignoreSect
includeSect :: Parser Conditional
includeSect = do
skip (string "<![")
mspace
skip (string "INCLUDE")
mspace
skip (char '[')
ds <- extSubsetDecl
skip (string "]]>")
return (Include ds)
ignoreSect :: Parser Conditional
ignoreSect = do
skip (string "<![")
mspace
skip (string "IGNORE")
mspace
skip (char '[')
xss <- many ignoreSectContents
skip (string "]]>")
return (Ignore (concat xss))
ignoreSectContents :: Parser [String]
ignoreSectContents =
do x <- ignore
xss <- many $ do
skip (string "<![")
ys <- ignoreSectContents
skip (string "]]>")
y <- ignore
return (ys++[y])
return (x:concat xss)
ignore :: Parser String
ignore = stopOn ["<![", "]]>"]
charRef :: Parser Reference
charRef = do
skip (string "&#")
n <- p <|> (char 'x' >> q)
skip (char ';')
return (CharRef n)
where
p = fmap (foldl' (\a b -> a*10+ord b-48) 0) (many1 ('0' <..> '9'))
q = fmap hexa (many1 (ranges [('0', '9'), ('a', 'f'), ('A', 'F')]))
hexa :: String -> Int
hexa = rec 0
where
rec n [] = n
rec n (x:xs) = rec (16*n + ord x - correct) xs
where
correct
| x <= '9' = ord '0'
| x <= 'F' = ord 'A' - 10
| otherwise = ord 'a' - 10
reference :: Parser Reference
reference = try entityRef <|> charRef
entityRef :: Parser Reference
entityRef = between (char '&') (char ';') (fmap EntityRef name)
peReference :: Parser Parameter
peReference = between (char '%') (char ';') (fmap Parameter name)
entityDecl :: Parser DocTypeDecl
entityDecl = try geDecl <|> peDecl
geDecl :: Parser DocTypeDecl
geDecl = do
skip (string "<!ENTITY")
n <- spacedName
ed <- entityDef
mspace
skip (char '>')
return (EntityDecl True n ed)
peDecl :: Parser DocTypeDecl
peDecl = do
skip (string "<!ENTITY")
space
skip (char '%')
n <- spacedName
e <- peDef
mspace
skip (char '>')
return (EntityDecl False n (either Left (\a -> Right (a, Nothing)) e))
entityDef :: Parser EntityDef
entityDef = fmap Left entityValue <|> do
e <- externalID
ms <- optionMaybe (try nDataDecl)
return (Right (e, ms))
peDef :: Parser (Either EntityValue ExternalID)
peDef = fmap Left entityValue <|> fmap Right externalID
externalID :: Parser ExternalID
externalID = (string "SYSTEM" >> space >> fmap System systemLiteral) <|> do
skip (string "PUBLIC")
space
x <- pubidLiteral
space
y <- systemLiteral
return (Public x y)
nDataDecl :: Parser String
nDataDecl = space >> string "NDATA" >> space >> name
textDecl :: Parser TextDecl
textDecl = do
skip (string "<?xml")
v <- optionMaybe versionInfo
e <- encodingDecl
mspace
skip (string "?>")
return (v, e)
extParsedEnt :: Parser (Maybe TextDecl, Content)
extParsedEnt = do
td <- optionMaybe (try textDecl)
c <- content
return (td, c)
encodingDecl :: Parser String
encodingDecl = space >> string "encoding" >> eq >>
(singleQuoted encName <|> doubleQuoted encName)
encName :: Parser String
encName = do
x <- ranges [('A', 'Z'), ('a', 'z')]
xs <- many (ranges [('A', 'Z'), ('a', 'z'), ('0', '9')] <|> oneOf "._-")
return (x:xs)
notationDecl :: Parser DocTypeDecl
notationDecl = do
skip (string "<!NOTATION")
n <- spacedName
e <- fmap Left (try externalID) <|> fmap Right publicID
mspace
skip (char '>')
return (NotationDecl n e)
publicID :: Parser PublicID
publicID = string "PUBLIC" >> space >> pubidLiteral