module Text.XML.HXT.Arrow.DocumentInput
( getURIContents
, getXmlContents
, getXmlEntityContents
, getEncoding
, getTextEncoding
, decodeDocument
)
where
import Control.Arrow
import Control.Arrow.ArrowList
import Control.Arrow.ArrowIf
import Control.Arrow.ArrowTree
import Control.Arrow.ArrowIO
import Control.Arrow.ListArrow
import Data.List ( isPrefixOf )
import System.FilePath ( takeExtension )
import Text.XML.HXT.DOM.Unicode ( getDecodingFct
, guessEncoding
, normalizeNL
)
import qualified Text.XML.HXT.IO.GetFILE as FILE
import qualified Text.XML.HXT.IO.GetHTTPLibCurl as LibCURL
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.Arrow.ParserInterface ( parseXmlDocEncodingSpec
, parseXmlEntityEncodingSpec
, removeEncodingSpec
)
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.Arrow.XmlIOStateArrow
protocolHandlers :: AssocList String (IOStateArrow s XmlTree XmlTree)
protocolHandlers
= [ ("file", getFileContents)
, ("http", getHttpContents)
, ("stdin", getStdinContents)
]
getProtocolHandler :: IOStateArrow s String (IOStateArrow s XmlTree XmlTree)
getProtocolHandler
= arr (\ s -> lookupDef getUnsupported s protocolHandlers)
getUnsupported :: IOStateArrow s XmlTree XmlTree
getUnsupported
= perform ( getAttrValue a_source
>>>
arr (("unsupported protocol in URI " ++) . show)
>>>
applyA (arr issueFatal)
)
>>>
setDocumentStatusFromSystemState "accessing documents"
getStringContents :: IOStateArrow s XmlTree XmlTree
getStringContents
= setCont $< getAttrValue a_source
>>>
addAttr transferMessage "OK"
>>>
addAttr transferStatus "200"
where
setCont contents
= replaceChildren (txt contents')
>>>
addAttr transferURI (take 7 contents)
>>>
addAttr a_source (show . prefix 48 $ contents')
where
contents' = drop (length stringProtocol) contents
prefix l s
| length s' > l = take (l 3) s' ++ "..."
| otherwise = s'
where
s' = take (l + 1) s
getFileContents :: IOStateArrow s XmlTree XmlTree
getFileContents
= applyA ( ( ( getAttrValue a_strict_input
>>>
arr isTrueValue
)
&&&
( getAttrValue transferURI
>>>
getPathFromURI
)
)
>>>
traceValue 2 (\ (b, f) -> "read file " ++ show f ++ " (strict input = " ++ show b ++ ")")
>>>
arrIO (uncurry FILE.getCont)
>>>
( arr (uncurry addError)
|||
arr addTxtContent
)
)
>>>
addMimeType
getStdinContents :: IOStateArrow s XmlTree XmlTree
getStdinContents
= applyA ( getAttrValue a_strict_input
>>>
arr isTrueValue
>>>
arrIO FILE.getStdinCont
>>>
( arr (uncurry addError)
|||
arr addTxtContent
)
)
addError :: [(String, String)] -> String -> IOStateArrow s XmlTree XmlTree
addError al e
= issueFatal e
>>>
seqA (map (uncurry addAttr) al)
>>>
setDocumentStatusFromSystemState "accessing documents"
addMimeType :: IOStateArrow s XmlTree XmlTree
addMimeType
= addMime $< ( getAttrValue transferURI
>>>
( uriToMime $< getSysParam xio_mimeTypes )
)
where
addMime mt
= addAttr transferMimeType mt
uriToMime mtt
= arr $ ( \ uri -> extensionToMimeType (drop 1 . takeExtension $ uri) mtt )
addTxtContent :: String -> IOStateArrow s XmlTree XmlTree
addTxtContent c
= replaceChildren (txt c)
>>>
addAttr transferMessage "OK"
>>>
addAttr transferStatus "200"
getHttpContents :: IOStateArrow s XmlTree XmlTree
getHttpContents
= getCont $<< ( getAttrValue transferURI
&&&
( ( getAttrlAsAssoc
&&&
getAllParamsString
)
>>^ uncurry addEntries
)
)
where
getAttrlAsAssoc
= listA ( getAttrl
>>> ( getName
&&&
xshow getChildren
)
)
getCont uri options
= applyA ( ( traceMsg 2 ( "get HTTP via libcurl, uri=" ++ show uri ++ " options=" ++ show options )
>>>
arrIO0 ( LibCURL.getCont options uri )
)
>>>
( arr (uncurry addError)
|||
arr addContent
)
)
addContent :: (AssocList String String, String) -> IOStateArrow s XmlTree XmlTree
addContent (al, c)
= replaceChildren (txt c)
>>>
seqA (map (uncurry addAttr) al)
getURIContents :: IOStateArrow s XmlTree XmlTree
getURIContents
= getContentsFromString
`orElse`
getContentsFromDoc
where
getContentsFromString
= ( getAttrValue a_source
>>>
isA (isPrefixOf stringProtocol)
)
`guards`
getStringContents
getContentsFromDoc
= ( ( addTransferURI $< getBaseURI
>>>
getCont
)
`when`
( setAbsURI $< ( getAttrValue a_source
>>^
( \ src-> (if null src then "stdin:" else src) )
)
)
)
>>>
setDocumentStatusFromSystemState "getURIContents"
setAbsURI src
= ifA ( constA src >>> changeBaseURI )
this
( issueFatal ("illegal URI : " ++ show src) )
addTransferURI uri
= addAttr transferURI uri
getCont
= applyA ( getBaseURI
>>>
traceValue 2 (("getURIContents: reading " ++) . show)
>>>
getSchemeFromURI
>>>
getProtocolHandler
)
`orElse`
this
setBaseURIFromDoc :: IOStateArrow s XmlTree XmlTree
setBaseURIFromDoc
= perform ( getAttrValue transferURI
>>>
isA (isPrefixOf stringProtocol)
>>>
setBaseURI
)
getXmlContents :: IOStateArrow s XmlTree XmlTree
getXmlContents
= getXmlContents' parseXmlDocEncodingSpec
>>>
setBaseURIFromDoc
getXmlEntityContents :: IOStateArrow s XmlTree XmlTree
getXmlEntityContents
= getXmlContents' parseXmlEntityEncodingSpec
>>>
processChildren removeEncodingSpec
>>>
setBaseURIFromDoc
getXmlContents' :: IOStateArrow s XmlTree XmlTree -> IOStateArrow s XmlTree XmlTree
getXmlContents' parseEncodingSpec
= ( getURIContents
>>>
choiceA
[ isXmlHtmlDoc :-> ( parseEncodingSpec
>>>
filterErrorMsg
>>>
decodeDocument
)
, isTextDoc :-> decodeDocument
, this :-> this
]
>>>
perform ( getAttrValue transferURI
>>>
traceValue 1 (("getXmlContents: content read and decoded for " ++) . show)
)
>>>
traceTree
>>>
traceSource
)
`when`
isRoot
isMimeDoc :: (String -> Bool) -> IOStateArrow s XmlTree XmlTree
isMimeDoc isMT = fromLA $
( ( getAttrValue transferMimeType >>^ stringToLower )
>>>
isA (\ t -> null t || isMT t)
)
`guards` this
isTextDoc, isXmlHtmlDoc :: IOStateArrow s XmlTree XmlTree
isTextDoc = isMimeDoc isTextMimeType
isXmlHtmlDoc = isMimeDoc (\ mt -> isHtmlMimeType mt || isXmlMimeType mt)
getEncoding :: IOStateArrow s XmlTree String
getEncoding
= catA [ xshow getChildren
>>>
arr guessEncoding
, getAttrValue transferEncoding
, getAttrValue a_encoding
, getParamString a_encoding
, constA utf8
]
>. (head . filter (not . null))
getTextEncoding :: IOStateArrow s XmlTree String
getTextEncoding
= catA [ getAttrValue transferEncoding
, getAttrValue a_encoding
, getParamString a_encoding
, constA isoLatin1
]
>. (head . filter (not . null))
decodeDocument :: IOStateArrow s XmlTree XmlTree
decodeDocument
= choiceA
[ ( isRoot >>> isXmlHtmlDoc ) :-> ( decodeArr normalizeNL $< getEncoding )
, ( isRoot >>> isTextDoc ) :-> ( decodeArr id $< getTextEncoding )
, this :-> this
]
where
decodeArr :: (String -> String) -> String -> IOStateArrow s XmlTree XmlTree
decodeArr normalizeNewline enc
= maybe notFound found . getDecodingFct $ enc
where
found df
= traceMsg 2 ("decodeDocument: encoding is " ++ show enc)
>>>
( decodeText df $< getAttrValue a_ignore_encoding_errors )
>>>
addAttr transferEncoding enc
notFound
= issueFatal ("encoding scheme not supported: " ++ show enc)
>>>
setDocumentStatusFromSystemState "decoding document"
decodeText df ignoreErrs
= processChildren
( getText
>>> arr df
>>> ( ( (fst >>> normalizeNewline)
^>> mkText
)
<+>
( if isTrueValue ignoreErrs
then none
else
( arrL snd
>>>
arr ((enc ++) . (" encoding error" ++))
>>>
applyA (arr issueErr)
>>>
none
)
)
)
)