module HtmlParser2 (parseHtml,parseHtmlText,parseHtmlBlocks) where import Html import HtmlTags import HtmlLex import ParsOps3 import HtmlParsOps import Utils2(isSpace') -- | 'parseHtml' is an HTML parser. It normally returns @Right html@. -- The parser is built using error-correcting parsing combinators, so it should -- not fail, but if it does it returns @Left errormessage@. -- -- 'parseHtml' parses /complete/ HTML documents. The structure of the -- 'Html' value follows the structure described in the HTML 4.0 standard. parseHtml :: String -> Either ([String],String) Html parseHtml = parse' htmlDocument parseHtmlText :: String -> Either ([String],String) Html parseHtmlText = parse' optText -- ^ Parse a text level HTML fragment parseHtmlBlocks :: String -> Either ([String],String) Html parseHtmlBlocks = parse' blocks -- ^ Parse a block level HTML fragment parse' part s = case parseToEof part (htmlLex s) of Right html -> Right html Left (ts, es) -> Left (map sh1 es, showHtmlLex ts) htmlDocument = two <$> doctype <*> htmlDoc <* whitespace -- one <$> htmlDoc where doctype = space *> optional defaultDoctype (comment <* whitespace) where defaultDoctype = garbage doctypehtml40 htmlDoc = impliedCtx HTML headAndBody doctypehtml40 = "!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\"" headAndBody = -- one <$> headPart (:) <$> headPart <* whitespace <*> framesetAndBody headPart = whitespace *> impliedCtx HEAD (many headElement) where headElement = title <|> headCmd <|> script <|> style <|> badHeaderStuff where title = ctx1 TITLE plaintext headCmd = cmd [ISINDEX, LINK, BASE,META] --"NEXTID" script = anythingButCtx SCRIPT style = anythingButCtx STYLE badHeaderStuff = whitespace1 framesetAndBody = (++) <$> optFramesetPart <*> noFramesPart optFramesetPart = optional [] (one <$> frameset) where frameset = ctx1' FRAMESET frames frames = whitespace *> some frame frame = (cmd1 FRAME <|> frameset) <* whitespace noFramesPart = whitespace *> (one <$> (noframes <|> bodyPart)) where noframes = ctx1' NOFRAMES (one <$> bodyPart) bodyPart = whitespace *> impliedCtx BODY blocks <* whitespace blocks = trimctx <$> many block block = {-bclean <$>-} block' block' = block'' <|> (p0 . trim) <$> text block'' = ctx [UL,OL,DIR,MENU] list <|> ctx1 DL dlist <|> ctx' [H1 .. H6] optText <|> ctx nestedBlockTags blocks <|> ctx1 TABLE table <|> ctx1 FORM blocks -- form <|> ctx1 PRE optPreText <|> ctx1' ADDRESS optText --

is allowed too! <|> cmd1 HR <|> cmd1 ISINDEX -- only allowed in head according to standard <|> ctx1' P optText where nestedBlockTags = [BLOCKQUOTE,DIV,CENTER,MAIN,ARTICLE,ASIDE,DETAILS, FIGCAPTION,FIGURE,FOOTER,HEADER,HGROUP,NAV,SECTION] list = whitespace *> (trimctx <$> many listElement) where listElement = (ctx1' LI blocks <* whitespace) <|> badCtx LI block -- allows text outside

  • ...
  • dlist = trimctx <$> many dlistElement where dlistElement = ctx1' DT blocks -- std: only text allowed <|> ctx1' DD blocks <|> badCtx DD block -- allows white space table = whitespace *> many (tableElement <* whitespace) where tableElement = ctx1 CAPTION optText <|> ctx [THEAD,TFOOT,TBODY] (many tr) <|> tr <|> formElement tr = ctx1' TR row <|> badCtx' TR row' row = whitespace *> many (cell <* whitespace) row' = some cell cell = ctx' [TH,TD] blocks -- <|> badCtx TD block -- allows white space -- form = many formElement -- elements can contain block elements in HTML5... -- https://www.w3.org/TR/html5/textlevel-semantics.html#the-a-element flow = many (block'' <|> textElement) optText = optional [] text optPreText = optional [] preText --text = text' chars preText = text --text' prechars --text' chars = txt text = trimctx <$> some textElement textElement = plain <|> special <|> chars <|> comment where txt0 = optional [] text -- Note: no direct or indirect references to text or preText below! plain = ctx' textLevelTags txt0 special = cmd [BR,IMG,BASEFONT] <|> ctx1 A flow <|> ctx [APPLET,FUPPLET,OBJECT] applet <|> ctx [AUDIO,VIDEO,PICTURE] media <|> ctx [IFRAME,BUTTON,LABEL,METER] txt0 <|> ctx1 MAP map <|> formElement <|> ctx1 SVG whitespace <|> anythingButCtx SCRIPT <|> anythingButCtx STYLE where applet = many appletElement where appletElement = cmd1 PARAM <|> textElement media = many mediaElement where mediaElement = cmd1 SOURCE <|> textElement map = many mapElement where mapElement = cmd1 AREA <|> whitespace1 formElement = cmd1 INPUT <|> ctx1 SELECT slist <|> ctx1 TEXTAREA plaintext -- Form elements are only allowed inside forms, but can -- occur in nested elements, so using a separate form -- parser (like for tables) is no good. where slist = many slistElement where slistElement = ctx1' OPTION plaintext ---} plaintext = many chars -- Text level markup: logicalTags = [EM,STRONG,DFN,CODE,SAMP,KBD,VAR,CITE,Q,SPAN,ABBR,ACRONYM,DEL,INS, TIME,MARK,BDI] physicalTags = [TT,I,B,U,STRIKE,BIG,SMALL,SUB,SUP] -- All text level tags for which we allow the end tag to be missing: textLevelTags = FONT:NOBR:logicalTags++physicalTags one x = [x] two x y = [x,y] --impliedCtx' t p = ctx [t] p p0 = HtmlContext (P,implicit) -- Used for handling bad html: badCtx t = fmap (HtmlContext (t,noAttrs) . trimctx . one) badCtx' t = fmap (HtmlContext (t,noAttrs) . trimctx) {- bclean item = case item of HtmlContext t@(n,_) html | n/=PRE -> HtmlContext t (trim html) _ -> item -} --trim = id --trimctx = id --{- trim [] = [] trim (HtmlChars s:html) = case dropWhile isSpace' s of "" -> trim html s -> HtmlChars s:trim html trim (HtmlContext t []:html) | okToTrim t = trim html trim html =trimctx html trimctx [] = [] trimctx (HtmlContext ta []:html) | okToTrim ta = trimctx html trimctx (item:html) = item:trimctx html okToTrim t = not (isTarget t || keep t) where isTarget (t,attrs) = t==A && hasAttr "NAME" attrs || hasAttr "ID" attrs keep (t,_) = t `elem` [TEXTAREA,TD,TH] --}