module Text.XML.HXT.Arrow.DocumentInput
( getXmlContents
, getXmlEntityContents
, getEncoding
, getTextEncoding
, decodeDocument
, addInputError
)
where
import Control.Arrow
import Control.Arrow.ArrowList
import Control.Arrow.ArrowIf
import Control.Arrow.ArrowTree
import Control.Arrow.ArrowIO
import Control.Arrow.ListArrow
import Data.List ( isPrefixOf )
import Data.String.Unicode ( getDecodingFct
, guessEncoding
, normalizeNL
)
import System.FilePath ( takeExtension )
import qualified Text.XML.HXT.IO.GetFILE as FILE
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.Arrow.ParserInterface
( parseXmlDocEncodingSpec
, parseXmlEntityEncodingSpec
, removeEncodingSpec
)
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.Arrow.XmlState
import Text.XML.HXT.Arrow.XmlState.TypeDefs
protocolHandlers :: AssocList String (IOStateArrow s XmlTree XmlTree)
protocolHandlers
= [ ("file", getFileContents)
, ("http", getHttpContents)
, ("https", getHttpContents)
, ("stdin", getStdinContents)
]
getProtocolHandler :: IOStateArrow s String (IOStateArrow s XmlTree XmlTree)
getProtocolHandler
= arr (\ s -> lookupDef getUnsupported s protocolHandlers)
getUnsupported :: IOStateArrow s XmlTree XmlTree
getUnsupported
= perform ( getAttrValue a_source
>>>
arr (("unsupported protocol in URI " ++) . show)
>>>
applyA (arr issueFatal)
)
>>>
setDocumentStatusFromSystemState "accessing documents"
getStringContents :: IOStateArrow s XmlTree XmlTree
getStringContents
= setCont $< getAttrValue a_source
>>>
addAttr transferMessage "OK"
>>>
addAttr transferStatus "200"
where
setCont contents
= replaceChildren (txt contents')
>>>
addAttr transferURI (take 7 contents)
>>>
addAttr a_source (show . prefix 48 $ contents')
where
contents' = drop (length stringProtocol) contents
prefix l s
| length s' > l = take (l 3) s' ++ "..."
| otherwise = s'
where
s' = take (l + 1) s
getFileContents :: IOStateArrow s XmlTree XmlTree
getFileContents
= applyA ( ( getSysVar theStrictInput
&&&
( getAttrValue transferURI
>>>
getPathFromURI
)
)
>>>
traceValue 2 (\ (b, f) -> "read file " ++ show f ++ " (strict input = " ++ show b ++ ")")
>>>
arrIO (uncurry FILE.getCont)
>>>
( arr (uncurry addInputError)
|||
arr addTxtContent
)
)
>>>
addMimeType
getStdinContents :: IOStateArrow s XmlTree XmlTree
getStdinContents
= applyA ( getSysVar theStrictInput
>>>
arrIO FILE.getStdinCont
>>>
( arr (uncurry addInputError)
|||
arr addTxtContent
)
)
addInputError :: Attributes -> String -> IOStateArrow s XmlTree XmlTree
addInputError al e
= issueFatal e
>>>
seqA (map (uncurry addAttr) al)
>>>
setDocumentStatusFromSystemState "accessing documents"
addMimeType :: IOStateArrow s XmlTree XmlTree
addMimeType
= addMime $< ( ( getSysVar theFileMimeType
>>>
isA (not . null)
)
`orElse`
( getAttrValue transferURI
>>>
( uriToMime $< getMimeTypeTable )
)
)
where
addMime mt
= addAttr transferMimeType mt
uriToMime mtt
= arr $ ( \ uri -> extensionToMimeType (drop 1 . takeExtension $ uri) mtt )
addTxtContent :: Blob -> IOStateArrow s XmlTree XmlTree
addTxtContent bc
= replaceChildren (blb bc)
>>>
addAttr transferMessage "OK"
>>>
addAttr transferStatus "200"
getHttpContents :: IOStateArrow s XmlTree XmlTree
getHttpContents
= withoutUserState $ applyA $ getSysVar theHttpHandler
getContentsFromString :: IOStateArrow s XmlTree XmlTree
getContentsFromString
= ( getAttrValue a_source
>>>
isA (isPrefixOf stringProtocol)
)
`guards`
getStringContents
getContentsFromDoc :: IOStateArrow s XmlTree XmlTree
getContentsFromDoc
= ( ( addTransferURI $< getBaseURI
>>>
getCont
)
`when`
( setAbsURI $< ( getAttrValue a_source
>>^
( \ src-> (if null src then "stdin:" else src) )
)
)
)
>>>
setDocumentStatusFromSystemState "getContentsFromDoc"
where
setAbsURI src
= ifA ( constA src >>> changeBaseURI )
this
( issueFatal ("illegal URI : " ++ show src) )
addTransferURI uri
= addAttr transferURI uri
getCont
= applyA ( getBaseURI
>>>
traceValue 2 (("getContentsFromDoc: reading " ++) . show)
>>>
getSchemeFromURI
>>>
getProtocolHandler
)
`orElse`
this
setBaseURIFromDoc :: IOStateArrow s XmlTree XmlTree
setBaseURIFromDoc
= perform ( getAttrValue transferURI
>>>
isA (isPrefixOf stringProtocol)
>>>
setBaseURI
)
getXmlContents :: IOStateArrow s XmlTree XmlTree
getXmlContents
= getXmlContents' parseXmlDocEncodingSpec
>>>
setBaseURIFromDoc
getXmlEntityContents :: IOStateArrow s XmlTree XmlTree
getXmlEntityContents
= traceMsg 2 "getXmlEntityContents"
>>>
addAttr transferMimeType text_xml_external_parsed_entity
>>>
getXmlContents' parseXmlEntityEncodingSpec
>>>
addAttr transferMimeType text_xml_external_parsed_entity
>>>
processChildren
( removeEncodingSpec
>>>
changeText normalizeNL
)
>>>
setBaseURIFromDoc
>>>
traceMsg 2 "getXmlEntityContents done"
getXmlContents' :: IOStateArrow s XmlTree XmlTree -> IOStateArrow s XmlTree XmlTree
getXmlContents' parseEncodingSpec
= ( getContentsFromString
`orElse`
( getContentsFromDoc
>>>
choiceA
[ isXmlHtmlDoc :-> ( parseEncodingSpec
>>>
filterErrorMsg
>>>
decodeDocument
)
, isTextDoc :-> decodeDocument
, this :-> this
]
>>>
perform ( getAttrValue transferURI
>>>
traceValue 1 (("getXmlContents: content read and decoded for " ++) . show)
)
>>>
traceDoc "getXmlContents'"
)
)
`when`
isRoot
isMimeDoc :: (String -> Bool) -> IOStateArrow s XmlTree XmlTree
isMimeDoc isMT = fromLA $
( ( getAttrValue transferMimeType >>^ stringToLower )
>>>
isA (\ t -> null t || isMT t)
)
`guards` this
isTextDoc, isXmlHtmlDoc :: IOStateArrow s XmlTree XmlTree
isTextDoc = isMimeDoc isTextMimeType
isXmlHtmlDoc = isMimeDoc (\ mt -> isHtmlMimeType mt || isXmlMimeType mt)
getEncoding :: IOStateArrow s XmlTree String
getEncoding
= catA [ xshow getChildren
>>>
arr guessEncoding
, getAttrValue transferEncoding
, getAttrValue a_encoding
, getSysVar theInputEncoding
, constA utf8
]
>. (head . filter (not . null))
getTextEncoding :: IOStateArrow s XmlTree String
getTextEncoding
= catA [ getAttrValue transferEncoding
, getAttrValue a_encoding
, getSysVar theInputEncoding
, constA isoLatin1
]
>. (head . filter (not . null))
decodeDocument :: IOStateArrow s XmlTree XmlTree
decodeDocument
= choiceA
[ ( isRoot >>> isXmlHtmlDoc ) :-> ( decodeX $< getSysVar theExpat)
, ( isRoot >>> isTextDoc ) :-> ( decodeArr $< getTextEncoding )
, this :-> this
]
where
decodeX :: Bool -> IOStateArrow s XmlTree XmlTree
decodeX False = decodeArr $< getEncoding
decodeX True = noDecode $< getEncoding
noDecode enc = traceMsg 2 ("no decoding (done by expat): encoding is " ++ show enc)
>>>
addAttr transferEncoding enc
decodeArr :: String -> IOStateArrow s XmlTree XmlTree
decodeArr enc
= maybe notFound found . getDecodingFct $ enc
where
found df
= traceMsg 2 ("decodeDocument: encoding is " ++ show enc)
>>>
( decodeText df $< getSysVar theEncodingErrors )
>>>
addAttr transferEncoding enc
notFound
= issueFatal ("encoding scheme not supported: " ++ show enc)
>>>
setDocumentStatusFromSystemState "decoding document"
decodeText df withEncErrors
= processChildren
( getText
>>> arr df
>>> ( ( fst ^>> mkText )
<+>
( if withEncErrors
then
( arrL snd
>>>
arr ((enc ++) . (" encoding error" ++))
>>>
applyA (arr issueErr)
>>>
none
)
else none
)
)
)