module Text.XML.HXT.Arrow.Edit
( canonicalizeAllNodes
, canonicalizeForXPath
, canonicalizeContents
, collapseAllXText
, collapseXText
, xshowEscapeXml
, escapeXmlDoc
, escapeHtmlDoc
, haskellRepOfXmlDoc
, treeRepOfXmlDoc
, addHeadlineToXmlDoc
, indentDoc
, numberLinesInXmlDoc
, preventEmptyElements
, removeComment
, removeAllComment
, removeWhiteSpace
, removeAllWhiteSpace
, removeDocWhiteSpace
, transfCdata
, transfAllCdata
, transfCharRef
, transfAllCharRef
, rememberDTDAttrl
, addDefaultDTDecl
, hasXmlPi
, addXmlPi
, addXmlPiEncoding
, addDoctypeDecl
, addXHtmlDoctypeStrict
, addXHtmlDoctypeTransitional
, addXHtmlDoctypeFrameset
)
where
import Control.Arrow
import Control.Arrow.ArrowList
import Control.Arrow.ArrowIf
import Control.Arrow.ArrowTree
import Control.Arrow.ListArrow
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.DOM.Interface
import qualified Text.XML.HXT.DOM.XmlNode as XN
import Text.XML.HXT.DOM.Unicode ( isXmlSpaceChar )
import Text.XML.HXT.DOM.FormatXmlTree ( formatXmlTree )
import Text.XML.HXT.Parser.HtmlParsec ( emptyHtmlTags )
import Text.XML.HXT.Parser.XmlEntities ( xmlEntities )
import Text.XML.HXT.Parser.XhtmlEntities ( xhtmlEntities )
import Data.List ( isPrefixOf )
import qualified Data.Map as M
import Data.Maybe
canonicalizeTree' :: LA XmlTree XmlTree -> LA XmlTree XmlTree
canonicalizeTree' toBeRemoved
= processChildren (none `when` isText)
>>>
processBottomUp canonicalize1Node
where
canonicalize1Node :: LA XmlTree XmlTree
canonicalize1Node
= (deep isPi `when` isDTD)
>>>
(none `when` toBeRemoved)
>>>
( processAttrl ( processChildren transfCharRef
>>>
collapseXText
)
`when` isElem
)
>>>
transfCdata
>>>
transfCharRef
>>>
collapseXText
canonicalizeAllNodes :: ArrowList a => a XmlTree XmlTree
canonicalizeAllNodes
= fromLA $
canonicalizeTree' ( isCmt
<+>
isXmlPi
)
canonicalizeForXPath :: ArrowList a => a XmlTree XmlTree
canonicalizeForXPath
= fromLA $
canonicalizeTree' isXmlPi
canonicalizeContents :: ArrowList a => a XmlTree XmlTree
canonicalizeContents
= fromLA $
processBottomUp canonicalize1Node
where
canonicalize1Node :: LA XmlTree XmlTree
canonicalize1Node
= ( processAttrl ( processChildren transfCharRef
>>>
collapseXText
)
`when` isElem
)
>>>
transfCdata
>>>
transfCharRef
>>>
collapseXText
collapseXText' :: LA XmlTree XmlTree
collapseXText'
= replaceChildren ( listA getChildren >>> arrL (foldr mergeText' []) )
where
mergeText' :: XmlTree -> XmlTrees -> XmlTrees
mergeText' t1 (t2 : ts2)
| XN.isText t1 && XN.isText t2
= let
s1 = fromJust . XN.getText $ t1
s2 = fromJust . XN.getText $ t2
t = XN.mkText (s1 ++ s2)
in
t : ts2
mergeText' t1 ts
= t1 : ts
collapseXText :: ArrowList a => a XmlTree XmlTree
collapseXText
= fromLA $
collapseXText'
collapseAllXText :: ArrowList a => a XmlTree XmlTree
collapseAllXText
= fromLA $
processBottomUp collapseXText'
xshowEscapeXml :: ArrowXml a => a n XmlTree -> a n String
xshowEscapeXml f = xshow (f >>> escapeXmlDoc)
type EntityRefTable = M.Map Int String
xmlEntityRefTable
, xhtmlEntityRefTable :: EntityRefTable
xmlEntityRefTable = buildEntityRefTable $ xmlEntities
xhtmlEntityRefTable = buildEntityRefTable $ xhtmlEntities
buildEntityRefTable :: [(String, Int)] -> EntityRefTable
buildEntityRefTable = M.fromList . map (\ (x,y) -> (y,x) )
escapeText'' :: (Char -> XmlTree) -> (Char -> Bool) -> XmlTree -> XmlTrees
escapeText'' escChar isEsc t
= maybe [t] escape' . XN.getText $ t
where
escape' "" = [t]
escape' s = escape s
escape ""
= []
escape (c:s1)
| isEsc c
= escChar c : escape s1
escape s
= XN.mkText s1 : escape s2
where
(s1, s2) = break isEsc s
escapeEntityRef :: EntityRefTable -> Char -> XmlTree
escapeEntityRef entityTable c
= maybe (XN.mkCharRef c') XN.mkEntityRef . M.lookup c' $ entityTable
where
c' = fromEnum c
escapeXmlEntityRef :: Char -> XmlTree
escapeXmlEntityRef = escapeEntityRef xmlEntityRefTable
escapeHtmlEntityRef :: Char -> XmlTree
escapeHtmlEntityRef = escapeEntityRef xhtmlEntityRefTable
escapeXmlDoc :: ArrowList a => a XmlTree XmlTree
escapeXmlDoc
= fromLA $ escapeDoc escXmlText escXmlAttrValue
where
escXmlText
= arrL $ escapeText'' escapeXmlEntityRef (`elem` "<&")
escXmlAttrValue
= arrL $ escapeText'' escapeXmlEntityRef (`elem` "<>\"\'&\n\r\t")
escapeHtmlDoc :: ArrowList a => a XmlTree XmlTree
escapeHtmlDoc
= fromLA $ escapeDoc escHtmlText escHtmlAttrValue
where
escHtmlText
= arrL $ escapeText'' escapeHtmlEntityRef isHtmlTextEsc
escHtmlAttrValue
= arrL $ escapeText'' escapeHtmlEntityRef isHtmlAttrEsc
isHtmlTextEsc c
= c >= toEnum(128) || ( c `elem` "<&" )
isHtmlAttrEsc c
= c >= toEnum(128) || ( c `elem` "<>\"\'&\n\r\t" )
escapeDoc :: LA XmlTree XmlTree -> LA XmlTree XmlTree -> LA XmlTree XmlTree
escapeDoc escText escAttr
= escape
where
escape
= choiceA
[ isElem :-> ( processChildren escape
>>>
processAttrl escVal
)
, isText :-> escText
, isDTD :-> processTopDown escDTD
, this :-> this
]
escVal = processChildren escAttr
escDTD = escVal `when` ( isDTDEntity <+> isDTDPEntity )
preventEmptyElements :: ArrowList a => [String] -> Bool -> a XmlTree XmlTree
preventEmptyElements ns isHtml
= fromLA $ insertDummyElem
where
isNoneEmpty
| not (null ns) = hasNameWith (localPart >>> (`elem` ns))
| isHtml = hasNameWith (localPart >>> (`notElem` emptyHtmlTags))
| otherwise = this
insertDummyElem
= processBottomUp
( replaceChildren (txt "")
`when`
( isElem
>>>
isNoneEmpty
>>>
neg getChildren
)
)
haskellRepOfXmlDoc :: ArrowList a => a XmlTree XmlTree
haskellRepOfXmlDoc
= fromLA $
root [getAttrl] [show ^>> mkText]
numberLinesInXmlDoc :: ArrowList a => a XmlTree XmlTree
numberLinesInXmlDoc
= fromLA $
processChildren (changeText numberLines)
where
numberLines :: String -> String
numberLines str
= concat $
zipWith (\ n l -> lineNr n ++ l ++ "\n") [1..] (lines str)
where
lineNr :: Int -> String
lineNr n = (reverse (take 6 (reverse (show n) ++ replicate 6 ' '))) ++ " "
treeRepOfXmlDoc :: ArrowList a => a XmlTree XmlTree
treeRepOfXmlDoc
= fromLA $
root [getAttrl] [formatXmlTree ^>> mkText]
addHeadlineToXmlDoc :: ArrowXml a => a XmlTree XmlTree
addHeadlineToXmlDoc
= fromLA $ ( addTitle $< (getAttrValue a_source >>^ formatTitle) )
where
addTitle str
= replaceChildren ( txt str <+> getChildren <+> txt "\n" )
formatTitle str
= "\n" ++ headline ++ "\n" ++ underline ++ "\n\n"
where
headline = "content of: " ++ str
underline = map (const '=') headline
removeComment' :: LA XmlTree XmlTree
removeComment' = none `when` isCmt
removeComment :: ArrowXml a => a XmlTree XmlTree
removeComment = fromLA $ removeComment'
removeAllComment :: ArrowXml a => a XmlTree XmlTree
removeAllComment = fromLA $ processBottomUp removeComment'
removeWhiteSpace' :: LA XmlTree XmlTree
removeWhiteSpace' = none `when` isWhiteSpace
removeWhiteSpace :: ArrowXml a => a XmlTree XmlTree
removeWhiteSpace = fromLA $ removeWhiteSpace'
removeAllWhiteSpace :: ArrowXml a => a XmlTree XmlTree
removeAllWhiteSpace = fromLA $ processBottomUp removeWhiteSpace'
removeDocWhiteSpace :: ArrowXml a => a XmlTree XmlTree
removeDocWhiteSpace = fromLA $ removeRootWhiteSpace
removeRootWhiteSpace :: LA XmlTree XmlTree
removeRootWhiteSpace
= processChildren processRootElement
`when`
isRoot
where
processRootElement :: LA XmlTree XmlTree
processRootElement
= removeWhiteSpace >>> processChild
where
processChild
= choiceA [ isDTD
:-> removeAllWhiteSpace
, this
:-> replaceChildren ( getChildren
>>. indentTrees insertNothing False 1
)
]
indentDoc :: ArrowXml a => a XmlTree XmlTree
indentDoc = fromLA $
( ( isRoot `guards` indentRoot )
`orElse`
(root [] [this] >>> indentRoot >>> getChildren)
)
indentRoot :: LA XmlTree XmlTree
indentRoot = processChildren indentRootChildren
where
indentRootChildren
= removeText >>> indentChild >>> insertNL
where
removeText = none `when` isText
insertNL = this <+> txt "\n"
indentChild = ( replaceChildren
( getChildren
>>.
indentTrees (insertIndentation 2) False 1
)
`whenNot` isDTD
)
indentTrees :: (Int -> LA XmlTree XmlTree) -> Bool -> Int -> XmlTrees -> XmlTrees
indentTrees _ _ _ []
= []
indentTrees indentFilter preserveSpace level ts
= runLAs lsf ls
++
indentRest rs
where
runLAs f l
= runLA (constL l >>> f) undefined
(ls, rs)
= break XN.isElem ts
isSignificant :: Bool
isSignificant
= preserveSpace
||
(not . null . runLAs isSignificantPart) ls
isSignificantPart :: LA XmlTree XmlTree
isSignificantPart
= catA
[ isText `guards` neg isWhiteSpace
, isCdata
, isCharRef
, isEntityRef
]
lsf :: LA XmlTree XmlTree
lsf
| isSignificant
= this
| otherwise
= (none `when` isWhiteSpace)
>>>
(indentFilter level <+> this)
indentRest :: XmlTrees -> XmlTrees
indentRest []
| isSignificant
= []
| otherwise
= runLA (indentFilter (level 1)) undefined
indentRest (t':ts')
= runLA ( ( indentElem
>>>
lsf
)
`when` isElem
) t'
++
( if null ts'
then indentRest
else indentTrees indentFilter preserveSpace level
) ts'
where
indentElem
= replaceChildren ( getChildren
>>.
indentChildren
)
xmlSpaceAttrValue :: String
xmlSpaceAttrValue
= concat . runLA (getAttrValue "xml:space") $ t'
preserveSpace' :: Bool
preserveSpace'
= ( fromMaybe preserveSpace
.
lookup xmlSpaceAttrValue
) [ ("preserve", True)
, ("default", False)
]
indentChildren :: XmlTrees -> XmlTrees
indentChildren cs'
| all (maybe False (all isXmlSpaceChar) . XN.getText) cs'
= []
| otherwise
= indentTrees indentFilter preserveSpace' (level + 1) cs'
insertIndentation :: Int -> Int -> LA a XmlTree
insertIndentation indentWidth level
= txt ('\n' : replicate (level * indentWidth) ' ')
insertNothing :: Int -> LA a XmlTree
insertNothing _ = none
transfCdata' :: LA XmlTree XmlTree
transfCdata' = (getCdata >>> mkText) `when` isCdata
transfCdata :: ArrowXml a => a XmlTree XmlTree
transfCdata = fromLA $
transfCdata'
transfAllCdata :: ArrowXml a => a XmlTree XmlTree
transfAllCdata = fromLA $
processBottomUp transfCdata'
transfCharRef' :: LA XmlTree XmlTree
transfCharRef' = ( getCharRef >>> arr (\ i -> [toEnum i]) >>> mkText )
`when`
isCharRef
transfCharRef :: ArrowXml a => a XmlTree XmlTree
transfCharRef = fromLA $
transfCharRef'
transfAllCharRef :: ArrowXml a => a XmlTree XmlTree
transfAllCharRef = fromLA $
processBottomUp transfCharRef'
rememberDTDAttrl :: ArrowList a => a XmlTree XmlTree
rememberDTDAttrl
= fromLA $
( ( addDTDAttrl $< ( getChildren >>> isDTDDoctype >>> getDTDAttrl ) )
`orElse`
this
)
where
addDTDAttrl al
= seqA . map (uncurry addAttr) . map (first (dtdPrefix ++)) $ al
addDefaultDTDecl :: ArrowList a => a XmlTree XmlTree
addDefaultDTDecl
= fromLA $
( addDTD $< listA (getAttrl >>> (getName &&& xshow getChildren) >>> hasDtdPrefix) )
where
hasDtdPrefix
= isA (fst >>> (dtdPrefix `isPrefixOf`))
>>>
arr (first (drop (length dtdPrefix)))
addDTD []
= this
addDTD al
= replaceChildren
( mkDTDDoctype al none
<+>
txt "\n"
<+>
( getChildren >>> (none `when` isDTDDoctype) )
)
hasXmlPi :: ArrowXml a => a XmlTree XmlTree
hasXmlPi
= fromLA
( getChildren
>>>
isPi
>>>
hasName t_xml
)
addXmlPi :: ArrowXml a => a XmlTree XmlTree
addXmlPi
= fromLA
( insertChildrenAt 0 ( ( mkPi (mkSNsName t_xml) none
>>>
addAttr a_version "1.0"
)
<+>
txt "\n"
)
`whenNot`
hasXmlPi
)
addXmlPiEncoding :: ArrowXml a => String -> a XmlTree XmlTree
addXmlPiEncoding enc
= fromLA $
processChildren ( addAttr a_encoding enc
`when`
( isPi >>> hasName t_xml )
)
addXHtmlDoctypeStrict
, addXHtmlDoctypeTransitional
, addXHtmlDoctypeFrameset :: ArrowXml a => a XmlTree XmlTree
addXHtmlDoctypeStrict
= addDoctypeDecl "html" "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
addXHtmlDoctypeTransitional
= addDoctypeDecl "html" "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
addXHtmlDoctypeFrameset
= addDoctypeDecl "html" "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"
addDoctypeDecl :: ArrowXml a => String -> String -> String -> a XmlTree XmlTree
addDoctypeDecl rootElem public system
= fromLA $
replaceChildren
( mkDTDDoctype ( ( if null public then id else ( (k_public, public) : ) )
.
( if null system then id else ( (k_system, system) : ) )
$ [ (a_name, rootElem) ]
) none
<+>
txt "\n"
<+>
getChildren
)