module Text.XML.HXT.Arrow.ProcessDocument
( parseXmlDocument
, parseXmlDocumentWithExpat
, parseHtmlDocument
, validateDocument
, propagateAndValidateNamespaces
, andValidateNamespaces
, getDocumentContents
)
where
import Control.Arrow
import Control.Arrow.ArrowIf
import Control.Arrow.ArrowList
import Control.Arrow.ArrowTree
import Control.Arrow.ListArrow (fromLA)
import Control.Arrow.NTreeEdit
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.Arrow.XmlState
import Text.XML.HXT.Arrow.XmlState.TypeDefs
import Text.XML.HXT.Arrow.ParserInterface (parseHtmlDoc,
parseXmlDoc)
import Text.XML.HXT.Arrow.Edit (substAllXHTMLEntityRefs,
transfAllCharRef)
import Text.XML.HXT.Arrow.GeneralEntitySubstitution (processGeneralEntities)
import Text.XML.HXT.Arrow.DTDProcessing (processDTD)
import Text.XML.HXT.Arrow.DocumentInput (getXmlContents)
import Text.XML.HXT.Arrow.Namespace (propagateNamespaces, validateNamespaces)
import Text.XML.HXT.DTDValidation.Validation (generalEntitiesDefined,
getDTDSubset,
transform,
validate)
parseXmlDocument :: Bool -> Bool -> Bool -> Bool -> IOStateArrow s XmlTree XmlTree
parseXmlDocument validateD substDTD substHTML validateRX
= ( replaceChildren ( ( getAttrValue a_source
&&&
xshow getChildren
)
>>>
parseXmlDoc
>>>
filterErrorMsg
)
>>>
setDocumentStatusFromSystemState "parse XML document"
>>>
( ifA (fromLA getDTDSubset)
( processDTDandEntities
>>>
( if validate'
then validateDocument
else this
)
)
( if validate'
then traceMsg 2 "checkUndefinedEntityRefs: looking for undefined entity refs"
>>>
perform checkUndefinedEntityRefs
>>>
traceMsg 2 "checkUndefinedEntityRefs: looking for undefined entity refs done"
>>>
setDocumentStatusFromSystemState "decoding document"
else this
)
)
)
`when` documentStatusOk
where
validate'
= validateD && not validateRX
processDTDandEntities
= ( if validateD || substDTD
then processDTD
else this
)
>>>
( if substDTD
then ( processGeneralEntities
`when`
fromLA generalEntitiesDefined
)
else if substHTML
then substAllXHTMLEntityRefs
else this
)
>>>
transfAllCharRef
checkUndefinedEntityRefs :: IOStateArrow s XmlTree XmlTree
checkUndefinedEntityRefs
= deep isEntityRef
>>>
getEntityRef
>>>
arr (\ en -> "general entity reference \"&" ++ en ++ ";\" is undefined")
>>>
mkError c_err
>>>
filterErrorMsg
parseXmlDocumentWithExpat :: IOStateArrow s XmlTree XmlTree
parseXmlDocumentWithExpat
= ( withoutUserState $< getSysVar theExpatParser
)
`when` documentStatusOk
parseHtmlDocument :: IOStateArrow s XmlTree XmlTree
parseHtmlDocument
= ( perform ( getAttrValue a_source
>>>
traceValue 1 (("parseHtmlDoc: parse HTML document " ++) . show)
)
>>>
( parseHtml $< getSysVar (theTagSoup .&&&. theExpat) )
>>>
( removeWarnings $< getSysVar (theWarnings .&&&. theTagSoup) )
>>>
setDocumentStatusFromSystemState "parse HTML document"
>>>
traceTree
>>>
traceSource
>>>
perform ( getAttrValue a_source
>>>
traceValue 1 (\ src -> "parse HTML document " ++ show src ++ " finished")
)
)
`when` documentStatusOk
where
parseHtml (withTagSoup', withExpat')
| withExpat' = withoutUserState $< getSysVar theExpatParser
| withTagSoup' = withoutUserState $< getSysVar theTagSoupParser
| otherwise = traceMsg 1 ("parse document with parsec HTML parser")
>>>
replaceChildren
( ( getAttrValue a_source
&&&
xshow getChildren
)
>>>
parseHtmlDoc
)
removeWarnings (warnings, withTagSoup')
| warnings = processTopDownWithAttrl
filterErrorMsg
| withTagSoup' = this
| otherwise = fromLA $
editNTreeA [isError :-> none]
validateDocument :: IOStateArrow s XmlTree XmlTree
validateDocument
= ( traceMsg 1 "validating document"
>>>
perform ( validateDoc
>>>
filterErrorMsg
)
>>>
setDocumentStatusFromSystemState "document validation"
>>>
traceMsg 1 "document validated, transforming doc with respect to DTD"
>>>
transformDoc
>>>
traceMsg 1 "document transformed"
>>>
traceSource
>>>
traceTree
)
`when`
documentStatusOk
propagateAndValidateNamespaces :: IOStateArrow s XmlTree XmlTree
propagateAndValidateNamespaces
= ( traceMsg 1 "propagating namespaces"
>>>
propagateNamespaces
>>>
traceDoc "propagating namespaces done"
>>>
andValidateNamespaces
)
`when`
documentStatusOk
andValidateNamespaces :: IOStateArrow s XmlTree XmlTree
andValidateNamespaces
= ( traceMsg 1 "validating namespaces"
>>>
( setDocumentStatusFromSystemState "namespace propagation"
`when`
( validateNamespaces >>> perform filterErrorMsg )
)
>>>
traceMsg 1 "namespace validation finished"
)
`when`
documentStatusOk
getDocumentContents :: String -> IOStateArrow s b XmlTree
getDocumentContents src
= root [] []
>>>
addAttr a_source src
>>>
traceMsg 1 ("readDocument: start processing document " ++ show src)
>>>
getXmlContents
validateDoc :: ArrowList a => a XmlTree XmlTree
validateDoc = fromLA ( validate
`when`
getDTDSubset
)
transformDoc :: ArrowList a => a XmlTree XmlTree
transformDoc = fromLA transform