module Text.XML.HXT.Arrow.DocumentInput
( getXmlContents
, getXmlEntityContents
, getEncoding
, getTextEncoding
, decodeDocument
, addInputError
)
where
import Control.Arrow
import Control.Arrow.ArrowIf
import Control.Arrow.ArrowIO
import Control.Arrow.ArrowList
import Control.Arrow.ArrowTree
import Control.Arrow.ListArrow
import Data.List (isPrefixOf)
import Data.String.Unicode (getDecodingFct,
guessEncoding,
normalizeNL)
import System.FilePath (takeExtension)
import qualified Text.XML.HXT.IO.GetFILE as FILE
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.Arrow.ParserInterface (parseXmlDocEncodingSpec, parseXmlEntityEncodingSpec,
removeEncodingSpec)
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.Arrow.XmlState
import Text.XML.HXT.Arrow.XmlState.TypeDefs
protocolHandlers :: AssocList String (IOStateArrow s XmlTree XmlTree)
protocolHandlers
= [ ("file", getFileContents)
, ("http", getHttpContents)
, ("https", getHttpContents)
, ("stdin", getStdinContents)
]
getProtocolHandler :: IOStateArrow s String (IOStateArrow s XmlTree XmlTree)
getProtocolHandler
= arr (\ s -> lookupDef getUnsupported s protocolHandlers)
getUnsupported :: IOStateArrow s XmlTree XmlTree
getUnsupported
= perform ( getAttrValue a_source
>>>
arr (("unsupported protocol in URI " ++) . show)
>>>
applyA (arr issueFatal)
)
>>>
setDocumentStatusFromSystemState "accessing documents"
getStringContents :: IOStateArrow s XmlTree XmlTree
getStringContents
= setCont $< getAttrValue a_source
>>>
addAttr transferMessage "OK"
>>>
addAttr transferStatus "200"
where
setCont contents
= replaceChildren (txt contents')
>>>
addAttr transferURI (take 7 contents)
>>>
addAttr a_source (show . prefix 48 $ contents')
where
contents' = drop (length stringProtocol) contents
prefix l s
| length s' > l = take (l 3) s' ++ "..."
| otherwise = s'
where
s' = take (l + 1) s
getFileContents :: IOStateArrow s XmlTree XmlTree
getFileContents
= applyA ( ( getSysVar theStrictInput
&&&
( getAttrValue transferURI
>>>
getPathFromURI
)
)
>>>
traceValue 2 (\ (b, f) -> "read file " ++ show f ++ " (strict input = " ++ show b ++ ")")
>>>
arrIO (uncurry FILE.getCont)
>>>
( arr (uncurry addInputError)
|||
arr addTxtContent
)
)
>>>
addMimeType
getStdinContents :: IOStateArrow s XmlTree XmlTree
getStdinContents
= applyA ( getSysVar theStrictInput
>>>
arrIO FILE.getStdinCont
>>>
( arr (uncurry addInputError)
|||
arr addTxtContent
)
)
addInputError :: Attributes -> String -> IOStateArrow s XmlTree XmlTree
addInputError al e
= issueFatal e
>>>
seqA (map (uncurry addAttr) al)
>>>
setDocumentStatusFromSystemState "accessing documents"
addMimeType :: IOStateArrow s XmlTree XmlTree
addMimeType
= addMime $< ( ( getSysVar theFileMimeType
>>>
isA (not . null)
)
`orElse`
( getAttrValue transferURI
>>>
( uriToMime $< getMimeTypeTable )
)
)
where
addMime mt
= addAttr transferMimeType mt
uriToMime mtt
= arr $ ( \ uri -> extensionToMimeType (drop 1 . takeExtension $ uri) mtt )
addTxtContent :: Blob -> IOStateArrow s XmlTree XmlTree
addTxtContent bc
= replaceChildren (blb bc)
>>>
addAttr transferMessage "OK"
>>>
addAttr transferStatus "200"
getHttpContents :: IOStateArrow s XmlTree XmlTree
getHttpContents
= withoutUserState $ applyA $ getSysVar theHttpHandler
getContentsFromString :: IOStateArrow s XmlTree XmlTree
getContentsFromString
= ( getAttrValue a_source
>>>
isA (isPrefixOf stringProtocol)
)
`guards`
getStringContents
getContentsFromDoc :: IOStateArrow s XmlTree XmlTree
getContentsFromDoc
= ( ( addTransferURI $< getBaseURI
>>>
getCont
)
`when`
( setAbsURI $< ( getAttrValue a_source
>>^
( \ src-> (if null src then "stdin:" else src) )
)
)
)
>>>
setDocumentStatusFromSystemState "getContentsFromDoc"
where
setAbsURI src
= ifA ( constA src >>> changeBaseURI )
this
( issueFatal ("illegal URI : " ++ show src) )
addTransferURI uri
= addAttr transferURI uri
getCont
= applyA ( getBaseURI
>>>
traceValue 2 (("getContentsFromDoc: reading " ++) . show)
>>>
getSchemeFromURI
>>>
getProtocolHandler
)
`orElse`
this
setBaseURIFromDoc :: IOStateArrow s XmlTree XmlTree
setBaseURIFromDoc
= perform ( getAttrValue transferURI
>>>
isA (isPrefixOf stringProtocol)
>>>
setBaseURI
)
getXmlContents :: IOStateArrow s XmlTree XmlTree
getXmlContents
= getXmlContents' parseXmlDocEncodingSpec
>>>
setBaseURIFromDoc
getXmlEntityContents :: IOStateArrow s XmlTree XmlTree
getXmlEntityContents
= traceMsg 2 "getXmlEntityContents"
>>>
addAttr transferMimeType text_xml_external_parsed_entity
>>>
getXmlContents' parseXmlEntityEncodingSpec
>>>
addAttr transferMimeType text_xml_external_parsed_entity
>>>
processChildren
( removeEncodingSpec
>>>
changeText normalizeNL
)
>>>
setBaseURIFromDoc
>>>
traceMsg 2 "getXmlEntityContents done"
getXmlContents' :: IOStateArrow s XmlTree XmlTree -> IOStateArrow s XmlTree XmlTree
getXmlContents' parseEncodingSpec
= ( getContentsFromString
`orElse`
( getContentsFromDoc
>>>
choiceA
[ isXmlHtmlDoc :-> ( parseEncodingSpec
>>>
filterErrorMsg
>>>
decodeDocument
)
, isTextDoc :-> decodeDocument
, this :-> this
]
>>>
perform ( getAttrValue transferURI
>>>
traceValue 1 (("getXmlContents: content read and decoded for " ++) . show)
)
>>>
traceDoc "getXmlContents'"
)
)
`when`
isRoot
isMimeDoc :: (String -> Bool) -> IOStateArrow s XmlTree XmlTree
isMimeDoc isMT = fromLA $
( ( getAttrValue transferMimeType >>^ stringToLower )
>>>
isA (\ t -> null t || isMT t)
)
`guards` this
isTextDoc, isXmlHtmlDoc :: IOStateArrow s XmlTree XmlTree
isTextDoc = isMimeDoc isTextMimeType
isXmlHtmlDoc = isMimeDoc (\ mt -> isHtmlMimeType mt || isXmlMimeType mt)
getEncoding :: IOStateArrow s XmlTree String
getEncoding
= catA [ xshow getChildren
>>>
arr guessEncoding
, getAttrValue transferEncoding
, getAttrValue a_encoding
, getSysVar theInputEncoding
, constA utf8
]
>. (head . filter (not . null))
getTextEncoding :: IOStateArrow s XmlTree String
getTextEncoding
= catA [ getAttrValue transferEncoding
, getAttrValue a_encoding
, getSysVar theInputEncoding
, constA isoLatin1
]
>. (head . filter (not . null))
decodeDocument :: IOStateArrow s XmlTree XmlTree
decodeDocument
= choiceA
[ ( isRoot >>> isXmlHtmlDoc ) :-> ( decodeX $< getSysVar theExpat)
, ( isRoot >>> isTextDoc ) :-> ( decodeArr $< getTextEncoding )
, this :-> this
]
where
decodeX :: Bool -> IOStateArrow s XmlTree XmlTree
decodeX False = decodeArr $< getEncoding
decodeX True = noDecode $< getEncoding
noDecode enc = traceMsg 2 ("no decoding (done by expat): encoding is " ++ show enc)
>>>
addAttr transferEncoding enc
decodeArr :: String -> IOStateArrow s XmlTree XmlTree
decodeArr enc
= maybe notFound found . getDecodingFct $ enc
where
found df
= traceMsg 2 ("decodeDocument: encoding is " ++ show enc)
>>>
( decodeText df $< getSysVar theEncodingErrors )
>>>
addAttr transferEncoding enc
notFound
= issueFatal ("encoding scheme not supported: " ++ show enc)
>>>
setDocumentStatusFromSystemState "decoding document"
decodeText df withEncErrors
= processChildren
( getText
>>> arr df
>>> ( ( fst ^>> mkText )
<+>
( if withEncErrors
then
( arrL snd
>>>
arr ((enc ++) . (" encoding error" ++))
>>>
applyA (arr issueErr)
>>>
none
)
else none
)
)
)