module Text.XML.HXT.Parser.HtmlParsec
( parseHtmlText
, parseHtmlDocument
, parseHtmlContent
, isEmptyHtmlTag
, isInnerHtmlTagOf
, closesHtmlTag
, emptyHtmlTags
)
where
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.DOM.XmlNode
( mkText
, mkError
, mkCmt
, mkElement
, mkAttr
, mkDTDElem
)
import Text.ParserCombinators.Parsec
( Parser
, SourcePos
, anyChar
, between
, char
, eof
, getPosition
, many
, noneOf
, option
, parse
, satisfy
, string
, try
, (<|>)
)
import Text.XML.HXT.Parser.XmlTokenParser
( allBut
, dq
, eq
, gt
, name
, pubidLiteral
, skipS
, skipS0
, sq
, systemLiteral
, singleCharsT
, referenceT
)
import Text.XML.HXT.Parser.XmlParsec
( cDSect
, charData'
, misc
, parseXmlText
, pI
, xMLDecl'
)
import Text.XML.HXT.Parser.XmlCharParser
( xmlChar
)
import Data.Maybe
( fromMaybe
)
import Data.Char
( toLower
, toUpper
)
parseHtmlText :: String -> XmlTree -> XmlTrees
parseHtmlText loc t
= parseXmlText htmlDocument loc $ t
parseHtmlFromString :: Parser XmlTrees -> String -> String -> XmlTrees
parseHtmlFromString parser loc
= either ((:[]) . mkError c_err . (++ "\n") . show) id . parse parser loc
parseHtmlDocument :: String -> String -> XmlTrees
parseHtmlDocument = parseHtmlFromString htmlDocument
parseHtmlContent :: String -> XmlTrees
parseHtmlContent = parseHtmlFromString htmlContent "text"
htmlDocument :: Parser XmlTrees
htmlDocument
= do
pl <- htmlProlog
el <- htmlContent
eof
return (pl ++ el)
htmlProlog :: Parser XmlTrees
htmlProlog
= do
xml <- option []
( try xMLDecl'
<|>
( do
pos <- getPosition
_ <- try (string "<?")
return $ [mkError c_warn (show pos ++ " wrong XML declaration")]
)
)
misc1 <- many misc
dtdPart <- option []
( try doctypedecl
<|>
( do
pos <- getPosition
_ <- try (upperCaseString "<!DOCTYPE")
return $ [mkError c_warn (show pos ++ " HTML DOCTYPE declaration ignored")]
)
)
return (xml ++ misc1 ++ dtdPart)
doctypedecl :: Parser XmlTrees
doctypedecl
= between (try $ upperCaseString "<!DOCTYPE") (char '>')
( do
skipS
n <- name
exId <- ( do
skipS
option [] externalID
)
skipS0
return [mkDTDElem DOCTYPE ((a_name, n) : exId) []]
)
externalID :: Parser Attributes
externalID
= do
_ <- try (upperCaseString k_public)
skipS
pl <- pubidLiteral
sl <- option "" $ try ( do
skipS
systemLiteral
)
return $ (k_public, pl) : if null sl then [] else [(k_system, sl)]
htmlContent :: Parser XmlTrees
htmlContent
= option []
( do
context <- hContent ([], [])
pos <- getPosition
return $ closeTags pos context
)
where
closeTags _ (body, [])
= reverse body
closeTags pos' (body, ((tn, al, body1) : restOpen))
= closeTags pos' (addHtmlWarn (show pos' ++ ": no closing tag found for \"<" ++ tn ++ " ...>\"")
.
addHtmlTag tn al body
$
(body1, restOpen)
)
type OpenTags = [(String, XmlTrees, XmlTrees)]
type Context = (XmlTrees, OpenTags)
hElement :: Context -> Parser Context
hElement context
= ( do
t <- hSimpleData
return (addHtmlElems [t] context)
)
<|>
hOpenTag context
<|>
hCloseTag context
<|>
( do
pos <- getPosition
c <- xmlChar
return ( addHtmlWarn (show pos ++ " markup char " ++ show c ++ " not allowed in this context")
.
addHtmlElems [mkText [c]]
$
context
)
)
<|>
( do
pos <- getPosition
c <- anyChar
return ( addHtmlWarn ( show pos
++ " illegal data in input or illegal XML char "
++ show c
++ " found and ignored, possibly wrong encoding scheme used")
$
context
)
)
hSimpleData :: Parser XmlTree
hSimpleData
= charData'
<|>
try referenceT
<|>
try hComment
<|>
try pI
<|>
try cDSect
hCloseTag :: Context -> Parser Context
hCloseTag context
= do
n <- try ( do
_ <- string "</"
lowerCaseName
)
skipS0
pos <- getPosition
checkSymbol gt ("closing > in tag \"</" ++ n ++ "\" expected") (closeTag pos n context)
hOpenTag :: Context -> Parser Context
hOpenTag context
= ( do
pos <- getPosition
e <- hOpenTagStart
hOpenTagRest pos e context
)
hOpenTagStart :: Parser (String, XmlTrees)
hOpenTagStart
= do
n <- try ( do
_ <- char '<'
n <- lowerCaseName
return n
)
skipS0
as <- hAttrList
return (n, as)
hOpenTagRest :: SourcePos -> (String, XmlTrees) -> Context -> Parser Context
hOpenTagRest pos (tn, al) context
= ( do
_ <- try $ string "/>"
return (addHtmlTag tn al [] context)
)
<|>
( do
context1 <- checkSymbol gt ("closing > in tag \"<" ++ tn ++ "...\" expected") context
return ( let context2 = closePrevTag pos tn context1
in
( if isEmptyHtmlTag tn
then addHtmlTag tn al []
else openTag tn al
) context2
)
)
hAttrList :: Parser XmlTrees
hAttrList
= many (try hAttribute)
where
hAttribute
= do
n <- lowerCaseName
v <- hAttrValue
skipS0
return $ mkAttr (mkName n) v
hAttrValue :: Parser XmlTrees
hAttrValue
= option []
( try ( do
eq
hAttrValue'
)
)
hAttrValue' :: Parser XmlTrees
hAttrValue'
= try ( between dq dq (hAttrValue'' "&\"") )
<|>
try ( between sq sq (hAttrValue'' "&\'") )
<|>
( do
cs <- many (noneOf " \r\t\n>\"\'")
return [mkText cs]
)
hAttrValue'' :: String -> Parser XmlTrees
hAttrValue'' notAllowed
= many ( hReference' <|> singleCharsT notAllowed)
hReference' :: Parser XmlTree
hReference'
= try referenceT
<|>
( do
_ <- char '&'
return (mkText "&")
)
hContent :: Context -> Parser Context
hContent context
= option context
( do
context1 <- hElement context
hContent context1
)
hComment :: Parser XmlTree
hComment
= do
c <- between (try $ string "<!--") (string "-->") (allBut many "-->")
return (mkCmt c)
checkSymbol :: Parser a -> String -> Context -> Parser Context
checkSymbol p msg context
= do
pos <- getPosition
option (addHtmlWarn (show pos ++ " " ++ msg) context)
( do
_ <- try p
return context
)
lowerCaseName :: Parser String
lowerCaseName
= do
n <- name
return (map toLower n)
upperCaseString :: String -> Parser String
upperCaseString
= sequence . map (\ c -> satisfy (( == c) . toUpper))
addHtmlTag :: String -> XmlTrees -> XmlTrees -> Context -> Context
addHtmlTag tn al body (body1, openTags)
= ([mkElement (mkName tn) al (reverse body)] ++ body1, openTags)
addHtmlWarn :: String -> Context -> Context
addHtmlWarn msg
= addHtmlElems [mkError c_warn msg]
addHtmlElems :: XmlTrees -> Context -> Context
addHtmlElems elems (body, openTags)
= (reverse elems ++ body, openTags)
openTag :: String -> XmlTrees -> Context -> Context
openTag tn al (body, openTags)
= ([], (tn, al, body) : openTags)
closeTag :: SourcePos -> String -> Context -> Context
closeTag pos n context
| n `elem` (map ( \ (n1, _, _) -> n1) $ snd context)
= closeTag' n context
| otherwise
= addHtmlWarn (show pos ++ " no opening tag found for </" ++ n ++ ">")
.
addHtmlTag n [] []
$
context
where
closeTag' n' (body', (n1, al1, body1) : restOpen)
= close context1
where
context1
= addHtmlTag n1 al1 body' (body1, restOpen)
close
| n' == n1
= id
| n1 `isInnerHtmlTagOf` n'
= closeTag pos n'
| otherwise
= addHtmlWarn (show pos ++ " no closing tag found for \"<" ++ n1 ++ " ...>\"")
.
closeTag' n'
closeTag' _ _
= error "illegal argument for closeTag'"
closePrevTag :: SourcePos -> String -> Context -> Context
closePrevTag _pos _n context@(_body, [])
= context
closePrevTag pos n context@(body, (n1, al1, body1) : restOpen)
| n `closes` n1
= closePrevTag pos n
( addHtmlWarn (show pos ++ " tag \"<" ++ n1 ++ " ...>\" implicitly closed by opening tag \"<" ++ n ++ " ...>\"")
.
addHtmlTag n1 al1 body
$
(body1, restOpen)
)
| otherwise
= context
isEmptyHtmlTag :: String -> Bool
isEmptyHtmlTag n
= n `elem`
emptyHtmlTags
emptyHtmlTags :: [String]
emptyHtmlTags
= [ "area"
, "base"
, "br"
, "col"
, "frame"
, "hr"
, "img"
, "input"
, "link"
, "meta"
, "param"
]
isInnerHtmlTagOf :: String -> String -> Bool
n `isInnerHtmlTagOf` tn
= n `elem`
( fromMaybe [] . lookup tn
$ [ ("body", ["p"])
, ("caption", ["p"])
, ("dd", ["p"])
, ("div", ["p"])
, ("dl", ["dt","dd"])
, ("dt", ["p"])
, ("li", ["p"])
, ("map", ["p"])
, ("object", ["p"])
, ("ol", ["li"])
, ("table", ["th","tr","td","thead","tfoot","tbody"])
, ("tbody", ["th","tr","td"])
, ("td", ["p"])
, ("tfoot", ["th","tr","td"])
, ("th", ["p"])
, ("thead", ["th","tr","td"])
, ("tr", ["th","td"])
, ("ul", ["li"])
]
)
closesHtmlTag
, closes :: String -> String -> Bool
closesHtmlTag = closes
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"dd" `closes` t | t `elem` ["dt","dd"] = True
"hr" `closes` "p" = True
"colgroup"
`closes` "colgroup" = True
"form" `closes` "form" = True
"label" `closes` "label" = True
"map" `closes` "map" = True
"object"
`closes` "object" = True
_ `closes` t | t `elem` ["option"
,"script"
,"style"
,"textarea"
,"title"
] = True
t `closes` "select" | t /= "option" = True
"thead" `closes` t | t `elem` ["colgroup"] = True
"tfoot" `closes` t | t `elem` ["thead"
,"colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody"
,"tfoot"
,"thead"
,"colgroup"] = True
t `closes` t2 | t `elem` ["h1","h2","h3"
,"h4","h5","h6"
,"dl","ol","ul"
,"table"
,"div","p"
]
&&
t2 `elem` ["h1","h2","h3"
,"h4","h5","h6"
,"p"
] = True
_ `closes` _ = False