module Text.XML.HXT.Parser.XmlParsec
( charData
, charData'
, comment
, pI
, cDSect
, document
, document'
, prolog
, xMLDecl
, xMLDecl'
, versionInfo
, misc
, doctypedecl
, markupdecl
, sDDecl
, element
, content
, contentWithTextDecl
, textDecl
, encodingDecl
, xread
, parseXmlAttrValue
, parseXmlContent
, parseXmlDocEncodingSpec
, parseXmlDocument
, parseXmlDTDPart
, parseXmlEncodingSpec
, parseXmlEntityEncodingSpec
, parseXmlGeneralEntityValue
, parseXmlPart
, parseXmlText
, parseNMToken
, parseName
, removeEncodingSpec
)
where
import Text.ParserCombinators.Parsec
( GenParser
, Parser
, parse
, (<?>), (<|>)
, char
, string
, eof
, between
, many, many1
, option
, try
, unexpected
, getPosition
, getInput
, sourceName
)
import Text.XML.HXT.DOM.ShowXml
( xshow
)
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.DOM.XmlNode
( mkElement
, mkAttr
, mkRoot
, mkDTDElem
, mkText
, mkCmt
, mkCdata
, mkError
, mkPi
, isText
, isRoot
, getText
, getChildren
, getAttrl
, getAttrName
, changeAttrl
, mergeAttrl
)
import Text.XML.HXT.Parser.XmlCharParser
( xmlChar
)
import qualified Text.XML.HXT.Parser.XmlTokenParser as XT
import qualified Text.XML.HXT.Parser.XmlDTDTokenParser as XD
import Data.Char (toLower)
import Data.Maybe
charData :: GenParser Char state XmlTrees
charData
= many (charData' <|> XT.referenceT)
charData' :: GenParser Char state XmlTree
charData'
= try ( do
t <- XT.allBut1 many1 (\ c -> not (c `elem` "<&")) "]]>"
return (mkText $! t)
)
comment :: GenParser Char state XmlTree
comment
= ( do
c <- between (try $ string "<!--") (string "-->") (XT.allBut many "--")
return (mkCmt $! c)
) <?> "comment"
pI :: GenParser Char state XmlTree
pI
= between (try $ string "<?") (string "?>")
( do
n <- pITarget
p <- option "" (do
_ <- XT.sPace
XT.allBut many "?>"
)
return $ mkPi (mkName n) [mkAttr (mkName a_value) [mkText p]]
) <?> "processing instruction"
where
pITarget :: GenParser Char state String
pITarget = ( do
n <- XT.name
if map toLower n == t_xml
then unexpected n
else return n
)
cDSect :: GenParser Char state XmlTree
cDSect
= do
t <- between ( try $ string "<![CDATA[") (string "]]>") (XT.allBut many "]]>")
return (mkCdata $! t)
<?> "CDATA section"
document :: GenParser Char state XmlTree
document
= do
pos <- getPosition
dl <- document'
return $ mkRoot [ mkAttr (mkName a_source) [mkText (sourceName pos)]
, mkAttr (mkName a_status) [mkText (show c_ok)]
] dl
document' :: GenParser Char state XmlTrees
document'
= do
pl <- prolog
el <- element
ml <- many misc
eof
return (pl ++ [el] ++ ml)
prolog :: GenParser Char state XmlTrees
prolog
= do
xml <- option [] xMLDecl'
misc1 <- many misc
dtdPart <- option [] doctypedecl
misc2 <- many misc
return (xml ++ misc1 ++ dtdPart ++ misc2)
xMLDecl :: GenParser Char state XmlTrees
xMLDecl
= between (try $ string "<?xml") (string "?>")
( do
vi <- versionInfo
ed <- option [] encodingDecl
sd <- option [] sDDecl
XT.skipS0
return (vi ++ ed ++ sd)
)
<?> "xml declaration"
xMLDecl' :: GenParser Char state XmlTrees
xMLDecl'
= do
al <- xMLDecl
return [mkPi (mkName t_xml) al]
xMLDecl'' :: GenParser Char state XmlTree
xMLDecl''
= do
al <- option [] (try xMLDecl)
return (mkRoot al [])
versionInfo :: GenParser Char state XmlTrees
versionInfo
= ( do
_ <- try ( do
XT.skipS
XT.keyword a_version
)
XT.eq
vi <- XT.quoted XT.versionNum
return [mkAttr (mkName a_version) [mkText vi]]
)
<?> "version info (with quoted version number)"
misc :: GenParser Char state XmlTree
misc
= comment
<|>
pI
<|>
( ( do
ws <- XT.sPace
return (mkText ws)
) <?> ""
)
doctypedecl :: GenParser Char state XmlTrees
doctypedecl
= between (try $ string "<!DOCTYPE") (char '>')
( do
XT.skipS
n <- XT.name
exId <- option [] ( try ( do
XT.skipS
externalID
)
)
XT.skipS0
markup <- option []
( do
m <- between (char '[' ) (char ']') markupOrDeclSep
XT.skipS0
return m
)
return [mkDTDElem DOCTYPE ((a_name, n) : exId) markup]
)
markupOrDeclSep :: GenParser Char state XmlTrees
markupOrDeclSep
= ( do
ll <- many ( markupdecl
<|>
declSep
<|>
XT.mkList conditionalSect
)
return (concat ll)
)
declSep :: GenParser Char state XmlTrees
declSep
= XT.mkList XT.peReferenceT
<|>
( do
XT.skipS
return []
)
markupdecl :: GenParser Char state XmlTrees
markupdecl
= XT.mkList
( pI
<|>
comment
<|>
XD.dtdDeclTokenizer
)
sDDecl :: GenParser Char state XmlTrees
sDDecl
= do
_ <- try (do
XT.skipS
XT.keyword a_standalone
)
XT.eq
sd <- XT.quoted (XT.keywords [v_yes, v_no])
return [mkAttr (mkName a_standalone) [mkText sd]]
element :: GenParser Char state XmlTree
element
= ( do
e <- elementStart
elementRest e
) <?> "element"
elementStart :: GenParser Char state (String, [(String, XmlTrees)])
elementStart
= do
n <- ( try ( do
_ <- char '<'
n <- XT.name
return n
)
<?> "start tag"
)
ass <- attrList
XT.skipS0
return (n, ass)
where
attrList
= option [] ( do
XT.skipS
attrList'
)
attrList'
= option [] ( do
a1 <- attribute
al <- attrList
let (n, _v) = a1
if isJust . lookup n $ al
then unexpected ("attribute name " ++ show n ++ " occurs twice in attribute list")
else return (a1 : al)
)
elementRest :: (String, [(String, XmlTrees)]) -> GenParser Char state XmlTree
elementRest (n, al)
= ( do
_ <- try $ string "/>"
return $! (mkElement (mkName n) (map (mkA $!) al) [])
)
<|>
( do
_ <- XT.gt
c <- content
eTag n
return $! (mkElement (mkName n) (map (mkA $!) al) $! c)
)
<?> "proper attribute list followed by \"/>\" or \">\""
where
mkA (n', ts') = mkAttr (mkName n') ts'
eTag :: String -> GenParser Char state ()
eTag n'
= do
_ <- try ( string "</" ) <?> ""
n <- XT.name
XT.skipS0
_ <- XT.gt
if n == n'
then return ()
else unexpected ("illegal end tag </" ++ n ++ "> found, </" ++ n' ++ "> expected")
attribute :: GenParser Char state (String, XmlTrees)
attribute
= do
n <- XT.name
XT.eq
v <- XT.attrValueT
return (n, v)
content :: GenParser Char state XmlTrees
content
= do
c1 <- charData
cl <- many
( do
l <- ( element
<|>
cDSect
<|>
pI
<|>
comment
)
c <- charData
return (l : c)
)
return (c1 ++ concat cl)
contentWithTextDecl :: GenParser Char state XmlTrees
contentWithTextDecl
= do
_ <- option [] textDecl
content
conditionalSect :: GenParser Char state XmlTree
conditionalSect
= do
_ <- try $ string "<!["
cs <- many XD.dtdToken
_ <- char '['
sect <- condSectCont
return (mkDTDElem CONDSECT [(a_value, sect)] cs)
where
condSectCont :: GenParser Char state String
condSectCont
= ( do
_ <- try $ string "]]>"
return ""
)
<|>
( do
_ <- try $ string "<!["
cs1 <- condSectCont
cs2 <- condSectCont
return ("<![" ++ cs1 ++ "]]>" ++ cs2)
)
<|>
( do
c <- xmlChar
cs <- condSectCont
return (c : cs)
)
externalID :: GenParser Char state Attributes
externalID
= ( do
_ <- XT.keyword k_system
XT.skipS
lit <- XT.systemLiteral
return [(k_system, lit)]
)
<|>
( do
_ <- XT.keyword k_public
XT.skipS
pl <- XT.pubidLiteral
XT.skipS
sl <- XT.systemLiteral
return [ (k_system, sl)
, (k_public, pl) ]
)
<?> "SYSTEM or PUBLIC declaration"
textDecl :: GenParser Char state XmlTrees
textDecl
= between (try $ string "<?xml") (string "?>")
( do
vi <- option [] versionInfo
ed <- encodingDecl
XT.skipS0
return (vi ++ ed)
)
<?> "text declaration"
textDecl'' :: GenParser Char state XmlTree
textDecl''
= do
al <- option [] (try textDecl)
return (mkRoot al [])
encodingDecl :: GenParser Char state XmlTrees
encodingDecl
= do
_ <- try ( do
XT.skipS
XT.keyword a_encoding
)
XT.eq
ed <- XT.quoted XT.encName
return [mkAttr (mkName a_encoding) [mkText ed]]
xread :: String -> XmlTrees
xread str
= parseXmlFromString parser loc str
where
loc = "string: " ++ show (if length str > 40 then take 40 str ++ "..." else str)
parser = do
res <- content
eof
return res
parseXmlContent :: XmlTree -> XmlTrees
parseXmlContent
= xread . xshow . (:[])
parseXmlText :: Parser XmlTrees -> String -> XmlTree -> XmlTrees
parseXmlText p loc = parseXmlFromString p loc . xshow . (:[])
parseXmlDocument :: String -> String -> XmlTrees
parseXmlDocument = parseXmlFromString document'
parseXmlFromString :: Parser XmlTrees -> String -> String -> XmlTrees
parseXmlFromString parser loc
= either ((:[]) . mkError c_err . (++ "\n") . show) id . parse parser loc
removeEncodingSpec :: XmlTree -> XmlTrees
removeEncodingSpec t
| isText t
= ( either ((:[]) . mkError c_err . (++ "\n") . show) ((:[]) . mkText)
. parse parser "remove encoding spec"
. fromMaybe ""
. getText
) t
| otherwise
= [t]
where
parser :: Parser String
parser = do
_ <- option [] textDecl
getInput
parseXmlPart :: Parser XmlTrees -> String -> String -> XmlTree -> XmlTrees
parseXmlPart parser expected context t
= parseXmlText ( do
res <- parser
eof <?> expected
return res
) context
$ t
parseXmlDTDPart :: String -> XmlTree -> XmlTrees
parseXmlDTDPart
= parseXmlPart markupOrDeclSep "markup declaration"
parseXmlGeneralEntityValue :: String -> XmlTree -> XmlTrees
parseXmlGeneralEntityValue
= parseXmlPart content "general entity value"
parseXmlAttrValue :: String -> XmlTree -> XmlTrees
parseXmlAttrValue
= parseXmlPart (XT.attrValueT' "<&") "attribute value"
parseNMToken :: String -> XmlTree -> XmlTrees
parseNMToken
= parseXmlPart (many1 XT.nmtokenT) "nmtoken"
parseName :: String -> XmlTree -> XmlTrees
parseName
= parseXmlPart (many1 XT.nameT) "name"
parseXmlEncodingSpec :: Parser XmlTree -> XmlTree -> XmlTrees
parseXmlEncodingSpec encDecl x
= (:[]) .
( if isRoot x
then parseEncSpec
else id
) $ x
where
parseEncSpec r
= case ( parse encDecl source . xshow . getChildren $ r ) of
Right t
-> changeAttrl (mergeAttrl . fromMaybe [] . getAttrl $ t) r
Left _
-> r
where
source = xshow . concat . map getChildren . filter ((== a_source) . maybe "" qualifiedName . getAttrName) . fromMaybe [] . getAttrl $ r
parseXmlEntityEncodingSpec :: XmlTree -> XmlTrees
parseXmlEntityEncodingSpec = parseXmlEncodingSpec textDecl''
parseXmlDocEncodingSpec :: XmlTree -> XmlTrees
parseXmlDocEncodingSpec = parseXmlEncodingSpec xMLDecl''