module Text.XML.HaXml.Html.ParseLazy
( htmlParse
) where
import Prelude hiding (either,maybe,sequence)
import qualified Prelude (either)
import Maybe hiding (maybe)
import Char (toLower, isSpace, isDigit, isHexDigit)
import Numeric (readDec,readHex)
import Monad
import Text.XML.HaXml.Types
import Text.XML.HaXml.Lex
import Text.XML.HaXml.Posn
import Text.ParserCombinators.Poly.NoLeak.Lazy
#if defined(DEBUG)
# if ( defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ > 502 ) || \
( defined(__NHC__) && __NHC__ > 114 ) || defined(__HUGS__)
import Debug.Trace(trace)
# elif defined(__GLASGOW_HASKELL__)
import IOExts(trace)
# elif defined(__NHC__) || defined(__HBC__)
import NonStdTrace
# endif
debug :: Monad m => String -> m ()
debug s = trace s (return ())
#else
debug :: Monad m => String -> m ()
debug s = return ()
#endif
htmlParse :: String -> String -> Document Posn
htmlParse name = fst . runParser document . xmlLex name
simplify :: Document i -> Document i
simplify (Document p st (Elem n avs cs) ms) =
Document p st (Elem n avs (deepfilter simp cs)) ms
where
simp (CElem (Elem "null" [] []) _) = False
simp (CElem (Elem n _ []) _) | n `elem` ["font","p","i","b","em"
,"tt","big","small"] = False
simp _ = True
deepfilter p =
filter p . map (\c-> case c of
CElem (Elem n avs cs) i
-> CElem (Elem n avs (deepfilter p cs)) i
_ -> c)
selfclosingtags = ["img","hr","br","meta","col","link","base"
,"param","area","frame","input"]
closeInnerTags =
[ ("ul", ["li"])
, ("ol", ["li"])
, ("dl", ["dt","dd"])
, ("tr", ["th","td"])
, ("div", ["p"])
, ("thead", ["th","tr","td"])
, ("tfoot", ["th","tr","td"])
, ("tbody", ["th","tr","td"])
, ("table", ["th","tr","td","thead","tfoot","tbody"])
, ("caption", ["p"])
, ("th", ["p"])
, ("td", ["p"])
, ("li", ["p"])
, ("dt", ["p"])
, ("dd", ["p"])
, ("object", ["p"])
, ("map", ["p"])
, ("body", ["p"])
]
closes :: Name -> Name -> Bool
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"dd" `closes` t | t `elem` ["dt","dd"] = True
"form" `closes` "form" = True
"label" `closes` "label" = True
_ `closes` "option" = True
"thead" `closes` t | t `elem` ["colgroup"] = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
"colgroup" `closes` "colgroup" = True
t `closes` "p"
| t `elem` ["p","h1","h2","h3","h4","h5","h6"
,"hr","div","ul","dl","ol","table"] = True
_ `closes` _ = False
fst3 (a,_,_) = a
snd3 (_,a,_) = a
thd3 (_,_,a) = a
type HParser a = Parser (Posn,TokenT) a
tok :: TokenT -> HParser TokenT
tok t = do (p,t') <- next
case t' of TokError s -> report failBad (show t) p t'
_ | t'==t -> return t
| otherwise -> report fail (show t) p t'
name :: HParser Name
name = do (p,tok) <- next
case tok of
TokName s -> return s
TokError s -> report failBad "a name" p tok
_ -> report fail "a name" p tok
string, freetext :: HParser String
string = do (p,t) <- next
case t of TokName s -> return s
_ -> report fail "text" p t
freetext = do (p,t) <- next
case t of TokFreeText s -> return s
_ -> report fail "text" p t
maybe :: HParser a -> HParser (Maybe a)
maybe p =
( p >>= return . Just) `onFail`
( return Nothing)
either :: HParser a -> HParser b -> HParser (Either a b)
either p q =
( p >>= return . Left) `onFail`
( q >>= return . Right)
word :: String -> HParser ()
word s = do { x <- next
; case x of
(p,TokName n) | s==n -> return ()
(p,TokFreeText n) | s==n -> return ()
(p,t@(TokError _)) -> report failBad (show s) p t
(p,t) -> report fail (show s) p t
}
posn :: HParser Posn
posn = do { x@(p,_) <- next
; reparse [x]
; return p
} `onFail` return noPos
nmtoken :: HParser NmToken
nmtoken = (string `onFail` freetext)
failP, failBadP :: String -> HParser a
failP msg = do { p <- posn; fail (msg++"\n at "++show p) }
failBadP msg = do { p <- posn; failBad (msg++"\n at "++show p) }
report :: (String->HParser a) -> String -> Posn -> TokenT -> HParser a
report fail exp p t = fail ("Expected "++show exp++" but found "++show t
++"\n at "++show p)
adjustErrP :: HParser a -> (String->String) -> HParser a
p `adjustErrP` f = p `onFail` do pn <- posn
(p `adjustErr` f) `adjustErr` (++show pn)
document :: HParser (Document Posn)
document = do
return Document
`apply` (prolog `adjustErr` ("unrecognisable XML prolog\n"++))
`apply` (return emptyST)
`apply` (do es <- many1 (element "HTML document")
return (case map snd es of
[e] -> e
es -> Elem "html" [] (map mkCElem es)))
`apply` (many misc)
where mkCElem e = CElem e noPos
comment :: HParser Comment
comment = do
bracket (tok TokCommentOpen) (tok TokCommentClose) freetext
processinginstruction :: HParser ProcessingInstruction
processinginstruction = do
tok TokPIOpen
commit $ do
n <- string `onFail` failP "processing instruction has no target"
f <- freetext
(tok TokPIClose `onFail` tok TokAnyClose) `onFail` failP "missing ?> or >"
return (n, f)
cdsect :: HParser CDSect
cdsect = do
tok TokSectionOpen
bracket (tok (TokSection CDATAx)) (tok TokSectionClose) chardata
prolog :: HParser Prolog
prolog = do
x <- maybe xmldecl
m1 <- many misc
dtd <- maybe doctypedecl
m2 <- many misc
return (Prolog x m1 dtd m2)
xmldecl :: HParser XMLDecl
xmldecl = do
tok TokPIOpen
(word "xml" `onFail` word "XML")
p <- posn
s <- freetext
tok TokPIClose `onFail` failBadP "missing ?> in <?xml ...?>"
(return . fst . runParser aux . xmlReLex p) s
where
aux = do
v <- versioninfo `onFail` failP "missing XML version info"
e <- maybe encodingdecl
s <- maybe sddecl
return (XMLDecl v e s)
versioninfo :: HParser VersionInfo
versioninfo = do
(word "version" `onFail` word "VERSION")
tok TokEqual
bracket (tok TokQuote) (tok TokQuote) freetext
misc :: HParser Misc
misc =
oneOf' [ ("<!--comment-->", comment >>= return . Comment)
, ("<?PI?>", processinginstruction >>= return . PI)
]
doctypedecl :: HParser DocTypeDecl
doctypedecl = do
tok TokSpecialOpen
tok (TokSpecial DOCTYPEx)
commit $ do
n <- name
eid <- maybe externalid
tok TokAnyClose `onFail` failP "missing > in DOCTYPE decl"
return (DTD n eid [])
sddecl :: HParser SDDecl
sddecl = do
(word "standalone" `onFail` word "STANDALONE")
commit $ do
tok TokEqual `onFail` failP "missing = in 'standalone' decl"
bracket (tok TokQuote) (tok TokQuote)
( (word "yes" >> return True) `onFail`
(word "no" >> return False) `onFail`
failP "'standalone' decl requires 'yes' or 'no' value" )
type Stack = [(Name,[Attribute])]
element :: Name -> HParser (Stack,Element Posn)
element ctx =
do
tok TokAnyOpen
(ElemTag e avs) <- elemtag
( if e `closes` ctx then
( do debug ("/")
unparse ([TokEndOpen, TokName ctx, TokAnyClose,
TokAnyOpen, TokName e] ++ reformatAttrs avs)
return ([], Elem "null" [] []))
else if e `elem` selfclosingtags then
( do tok TokEndClose
debug (e++"[+]")
return ([], Elem e avs [])) `onFail`
( do tok TokAnyClose
debug (e++"[+]")
return ([], Elem e avs []))
else
(( do tok TokEndClose
debug (e++"[]")
return ([], Elem e avs [])) `onFail`
( do tok TokAnyClose `onFail` failP "missing > or /> in element tag"
debug (e++"[")
return (\ interior-> let (stack,contained) = interior
in (stack, Elem e avs contained))
`apply`
(do zz <- manyFinally (content e)
(tok TokEndOpen)
n <- name
commit (tok TokAnyClose)
debug "]"
let (ss,cs) = unzip zz
let s = if null ss then [] else last ss
( if e == (map toLower n :: Name) then
do unparse (reformatTags (closeInner e s))
debug "^"
return ([], cs)
else
do unparse [TokEndOpen, TokName n, TokAnyClose]
debug "-"
return (((e,avs):s), cs)))
) `onFail` failP ("failed to repair non-matching tags in context: "++ctx)))
closeInner :: Name -> [(Name,[Attribute])] -> [(Name,[Attribute])]
closeInner c ts =
case lookup c closeInnerTags of
(Just these) -> filter ((`notElem` these).fst) ts
Nothing -> ts
unparse ts = do p <- posn
reparse (zip (repeat p) ts)
reformatAttrs avs = concatMap f0 avs
where f0 (a, v@(AttValue _)) = [TokName a, TokEqual, TokQuote,
TokFreeText (show v), TokQuote]
reformatTags ts = concatMap f0 ts
where f0 (t,avs) = [TokAnyOpen, TokName t]++reformatAttrs avs++[TokAnyClose]
content :: Name -> HParser (Stack,Content Posn)
content ctx = do { p <- posn ; content' p ctx }
where content' p ctx = oneOf'
[ ( "element", element ctx >>= \(s,e)-> return (s, CElem e p))
, ( "chardata", chardata >>= \s-> return ([], CString False s p))
, ( "reference", reference >>= \r-> return ([], CRef r p))
, ( "cdsect", cdsect >>= \c-> return ([], CString True c p))
, ( "misc", misc >>= \m-> return ([], CMisc m p))
] `adjustErrP` ("when looking for a content item,\n"++)
elemtag :: HParser ElemTag
elemtag = do
n <- name `adjustErrBad` ("malformed element tag\n"++)
as <- many attribute
return (ElemTag (map toLower n) as)
attribute :: HParser Attribute
attribute = do
n <- name
v <- (do tok TokEqual
attvalue) `onFail`
(return (AttValue [Left "TRUE"]))
return (map toLower n,v)
reference :: HParser Reference
reference = do
bracket (tok TokAmp) (tok TokSemi) (freetext >>= val)
where
val ('#':'x':i) | all isHexDigit i
= return . RefChar . fst . head . readHex $ i
val ('#':i) | all isDigit i
= return . RefChar . fst . head . readDec $ i
val name = return . RefEntity $ name
externalid :: HParser ExternalID
externalid =
( do word "SYSTEM"
s <- systemliteral
return (SYSTEM s)) `onFail`
( do word "PUBLIC"
p <- pubidliteral
s <- (systemliteral `onFail` return (SystemLiteral ""))
return (PUBLIC p s))
textdecl :: HParser TextDecl
textdecl = do
tok TokPIOpen
(word "xml" `onFail` word "XML")
v <- maybe versioninfo
e <- encodingdecl
tok TokPIClose `onFail` failP "expected ?> terminating text decl"
return (TextDecl v e)
encodingdecl :: HParser EncodingDecl
encodingdecl = do
(word "encoding" `onFail` word "ENCODING")
tok TokEqual `onFail` failBadP "expected = in 'encoding' decl"
f <- bracket (tok TokQuote) (tok TokQuote) freetext
return (EncodingDecl f)
publicid :: HParser PublicID
publicid = do
word "PUBLICID"
p <- pubidliteral
return (PUBLICID p)
entityvalue :: HParser EntityValue
entityvalue = do
evs <- bracket (tok TokQuote) (tok TokQuote) (many ev)
return (EntityValue evs)
ev :: HParser EV
ev =
( freetext >>= return . EVString) `onFail`
( reference >>= return . EVRef)
attvalue :: HParser AttValue
attvalue =
( do avs <- bracket (tok TokQuote) (tok TokQuote)
(many (either freetext reference))
return (AttValue avs) ) `onFail`
( do v <- nmtoken
s <- (tok TokPercent >> return "%") `onFail` return ""
return (AttValue [Left (v++s)]) ) `onFail`
( do s <- oneOf [ tok TokPlus >> return "+"
, tok TokHash >> return "#"
]
v <- nmtoken
return (AttValue [Left (s++v)]) ) `onFail`
failP "Badly formatted attribute value"
systemliteral :: HParser SystemLiteral
systemliteral = do
s <- bracket (tok TokQuote) (tok TokQuote) freetext
return (SystemLiteral s)
pubidliteral :: HParser PubidLiteral
pubidliteral = do
s <- bracket (tok TokQuote) (tok TokQuote) freetext
return (PubidLiteral s)
chardata :: HParser CharData
chardata = freetext