module Text.Pandoc.Readers.HTML ( readHtml
, htmlTag
, htmlInBalanced
, isInlineTag
, isBlockTag
, isTextTag
, isCommentTag
) where
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Pos
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import Text.Pandoc.Builder (text, toList)
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Data.Maybe ( fromMaybe, isJust )
import Data.List ( intercalate )
import Data.Char ( isSpace, isDigit )
import Control.Monad ( liftM, guard )
readHtml :: ParserState
-> String
-> Pandoc
readHtml st inp = Pandoc meta blocks
where blocks = readWith parseBody st rest
tags = canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
hasHeader = any (~== TagOpen "head" []) tags
(meta, rest) = if hasHeader
then parseHeader tags
else (Meta [] [] [], tags)
type TagParser = GenParser (Tag String) ParserState
parseHeader :: [Tag String] -> (Meta, [Tag String])
parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest)
where (tit,_) = break (~== TagClose "title") $ drop 1 $
dropWhile (\t -> not $ t ~== TagOpen "title" []) tags
tit' = concatMap fromTagText $ filter isTagText tit
tit'' = normalizeSpaces $ toList $ text tit'
rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head" ||
t ~== TagOpen "body" []) tags
parseBody :: TagParser [Block]
parseBody = liftM concat $ manyTill block eof
block :: TagParser [Block]
block = choice
[ pPara
, pHeader
, pBlockQuote
, pCodeBlock
, pList
, pHrule
, pSimpleTable
, pPlain
, pRawHtmlBlock
]
renderTags' :: [Tag String] -> String
renderTags' = renderTagsOptions
renderOptions{ optMinimize = (`elem` ["hr","br","img"]) }
pList :: TagParser [Block]
pList = pBulletList <|> pOrderedList <|> pDefinitionList
pBulletList :: TagParser [Block]
pBulletList = try $ do
pSatisfy (~== TagOpen "ul" [])
let nonItem = pSatisfy (\t ->
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
not (t ~== TagClose "ul"))
skipMany nonItem
items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
return [BulletList items]
pOrderedList :: TagParser [Block]
pOrderedList = try $ do
TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
st <- getState
let (start, style) = if stateStrict st
then (1, DefaultStyle)
else (sta', sty')
where sta = fromMaybe "1" $
lookup "start" attribs
sta' = if all isDigit sta
then read sta
else 1
sty = fromMaybe (fromMaybe "" $
lookup "style" attribs) $
lookup "class" attribs
sty' = case sty of
"lower-roman" -> LowerRoman
"upper-roman" -> UpperRoman
"lower-alpha" -> LowerAlpha
"upper-alpha" -> UpperAlpha
"decimal" -> Decimal
_ -> DefaultStyle
let nonItem = pSatisfy (\t ->
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
not (t ~== TagClose "ol"))
skipMany nonItem
items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
return [OrderedList (start, style, DefaultDelim) items]
pDefinitionList :: TagParser [Block]
pDefinitionList = try $ do
pSatisfy (~== TagOpen "dl" [])
items <- manyTill pDefListItem (pCloses "dl")
return [DefinitionList items]
pDefListItem :: TagParser ([Inline],[[Block]])
pDefListItem = try $ do
let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
skipMany nonItem
let term = intercalate [LineBreak] terms
return (term, defs)
pRawTag :: TagParser String
pRawTag = do
tag <- pAnyTag
let ignorable x = x `elem` ["html","head","body"]
if tagOpen ignorable (const True) tag || tagClose ignorable tag
then return []
else return $ renderTags' [tag]
pRawHtmlBlock :: TagParser [Block]
pRawHtmlBlock = do
raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
state <- getState
if stateParseRaw state && not (null raw)
then return [RawBlock "html" raw]
else return []
pHtmlBlock :: String -> TagParser String
pHtmlBlock t = try $ do
open <- pSatisfy (~== TagOpen t [])
contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
return $ renderTags' $ [open] ++ contents ++ [TagClose t]
pHeader :: TagParser [Block]
pHeader = try $ do
TagOpen tagtype attr <- pSatisfy $
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
(const True)
let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
let level = read (drop 1 tagtype)
contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof)
return $ if bodyTitle
then []
else [Header level $ normalizeSpaces contents]
pHrule :: TagParser [Block]
pHrule = do
pSelfClosing (=="hr") (const True)
return [HorizontalRule]
pSimpleTable :: TagParser [Block]
pSimpleTable = try $ do
TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
skipMany pBlank
head' <- option [] $ pInTags "th" pTd
rows <- many1 $ try $
skipMany pBlank >> pInTags "tr" pTd
skipMany pBlank
TagClose _ <- pSatisfy (~== TagClose "table")
let cols = maximum $ map length rows
let aligns = replicate cols AlignLeft
let widths = replicate cols 0
return [Table [] aligns widths head' rows]
pTd :: TagParser [TableCell]
pTd = try $ do
skipMany pBlank
res <- pInTags "td" pPlain
skipMany pBlank
return [res]
pBlockQuote :: TagParser [Block]
pBlockQuote = do
contents <- pInTags "blockquote" block
return [BlockQuote contents]
pPlain :: TagParser [Block]
pPlain = do
contents <- liftM (normalizeSpaces . concat) $ many1 inline
if null contents
then return []
else return [Plain contents]
pPara :: TagParser [Block]
pPara = do
contents <- pInTags "p" inline
return [Para $ normalizeSpaces contents]
pCodeBlock :: TagParser [Block]
pCodeBlock = try $ do
TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
let rawText = concatMap fromTagText $ filter isTagText contents
let result' = case rawText of
'\n':xs -> xs
_ -> rawText
let result = case reverse result' of
'\n':_ -> init result'
_ -> result'
let attribsId = fromMaybe "" $ lookup "id" attr
let attribsClasses = words $ fromMaybe "" $ lookup "class" attr
let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
st <- getState
let attribs = if stateStrict st
then ("",[],[])
else (attribsId, attribsClasses, attribsKV)
return [CodeBlock attribs result]
inline :: TagParser [Inline]
inline = choice
[ pTagText
, pEmph
, pStrong
, pSuperscript
, pSubscript
, pStrikeout
, pLineBreak
, pLink
, pImage
, pCode
, pRawHtmlInline
]
pLocation :: TagParser ()
pLocation = do
(TagPosition r c) <- pSat isTagPosition
setPosition $ newPos "input" r c
pSat :: (Tag String -> Bool) -> TagParser (Tag String)
pSat f = do
pos <- getPosition
token show (const pos) (\x -> if f x then Just x else Nothing)
pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String)
pSatisfy f = try $ optional pLocation >> pSat f
pAnyTag :: TagParser (Tag String)
pAnyTag = pSatisfy (const True)
pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool)
-> TagParser (Tag String)
pSelfClosing f g = do
open <- pSatisfy (tagOpen f g)
optional $ pSatisfy (tagClose f)
return open
pEmph :: TagParser [Inline]
pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph
pStrong :: TagParser [Inline]
pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong
pSuperscript :: TagParser [Inline]
pSuperscript = failIfStrict >> pInlinesInTags "sup" Superscript
pSubscript :: TagParser [Inline]
pSubscript = failIfStrict >> pInlinesInTags "sub" Subscript
pStrikeout :: TagParser [Inline]
pStrikeout = do
failIfStrict
pInlinesInTags "s" Strikeout <|>
pInlinesInTags "strike" Strikeout <|>
pInlinesInTags "del" Strikeout <|>
try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
contents <- liftM concat $ manyTill inline (pCloses "span")
return [Strikeout contents])
pLineBreak :: TagParser [Inline]
pLineBreak = do
pSelfClosing (=="br") (const True)
return [LineBreak]
pLink :: TagParser [Inline]
pLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
let url = fromAttrib "href" tag
let title = fromAttrib "title" tag
lab <- liftM concat $ manyTill inline (pCloses "a")
return [Link (normalizeSpaces lab) (escapeURI url, title)]
pImage :: TagParser [Inline]
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
let url = fromAttrib "src" tag
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
return [Image (toList $ text alt) (escapeURI url, title)]
pCode :: TagParser [Inline]
pCode = try $ do
(TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
result <- manyTill pAnyTag (pCloses open)
let ident = fromMaybe "" $ lookup "id" attr
let classes = words $ fromMaybe [] $ lookup "class" attr
let rest = filter (\(x,_) -> x /= "id" && x /= "class") attr
return [Code (ident,classes,rest)
$ intercalate " " $ lines $ innerText result]
pRawHtmlInline :: TagParser [Inline]
pRawHtmlInline = do
result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
state <- getState
if stateParseRaw state
then return [RawInline "html" $ renderTags' [result]]
else return []
pInlinesInTags :: String -> ([Inline] -> Inline)
-> TagParser [Inline]
pInlinesInTags tagtype f = do
contents <- pInTags tagtype inline
return [f contents]
pInTags :: String -> TagParser [a]
-> TagParser [a]
pInTags tagtype parser = try $ do
pSatisfy (~== TagOpen tagtype [])
liftM concat $ manyTill parser (pCloses tagtype <|> eof)
pCloses :: String -> TagParser ()
pCloses tagtype = try $ do
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
case t of
(TagClose t') | t' == tagtype -> pAnyTag >> return ()
(TagOpen t' _) | t' `closes` tagtype -> return ()
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "li" -> return ()
_ -> pzero
pTagText :: TagParser [Inline]
pTagText = try $ do
(TagText str) <- pSatisfy isTagText
st <- getState
case runParser (many pTagContents) st "text" str of
Left _ -> fail $ "Could not parse `" ++ str ++ "'"
Right result -> return result
pBlank :: TagParser ()
pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ all isSpace str
pTagContents :: GenParser Char ParserState Inline
pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
pStr :: GenParser Char ParserState Inline
pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c)
isSpecial :: Char -> Bool
isSpecial '"' = True
isSpecial '\'' = True
isSpecial '.' = True
isSpecial '-' = True
isSpecial '\8216' = True
isSpecial '\8217' = True
isSpecial '\8220' = True
isSpecial '\8221' = True
isSpecial _ = False
pSymbol :: GenParser Char ParserState Inline
pSymbol = satisfy isSpecial >>= return . Str . (:[])
pSpace :: GenParser Char ParserState Inline
pSpace = many1 (satisfy isSpace) >> return Space
eitherBlockOrInline :: [String]
eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
"map", "area", "object"]
blockHtmlTags :: [String]
blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
"dl", "fieldset", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "head", "hr", "html", "isindex", "menu",
"noframes", "noscript", "ol", "p", "pre", "table", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script", "style"]
isInlineTag :: Tag String -> Bool
isInlineTag t = tagOpen (`notElem` blockHtmlTags) (const True) t ||
tagClose (`notElem` blockHtmlTags) t ||
tagComment (const True) t
isBlockTag :: Tag String -> Bool
isBlockTag t = tagOpen (`elem` blocktags) (const True) t ||
tagClose (`elem` blocktags) t ||
tagComment (const True) t
where blocktags = blockHtmlTags ++ eitherBlockOrInline
isTextTag :: Tag String -> Bool
isTextTag = tagText (const True)
isCommentTag :: Tag String -> Bool
isCommentTag = tagComment (const True)
closes :: String -> String -> Bool
_ `closes` "body" = False
_ `closes` "html" = False
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"hr" `closes` "p" = True
"p" `closes` "p" = True
"meta" `closes` "meta" = True
"colgroup" `closes` "colgroup" = True
"form" `closes` "form" = True
"label" `closes` "label" = True
"map" `closes` "map" = True
"object" `closes` "object" = True
_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
t `closes` "select" | t /= "option" = True
"thead" `closes` t | t `elem` ["colgroup"] = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
t `closes` t2 |
t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] &&
t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True
t1 `closes` t2 |
t1 `elem` blockHtmlTags &&
t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True
_ `closes` _ = False
htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String
htmlInBalanced f = try $ do
(TagOpen t _, tag) <- htmlTag f
guard $ '/' `notElem` tag
let nonTagChunk = many1 $ satisfy (/= '<')
let stopper = htmlTag (~== TagClose t)
let anytag = liftM snd $ htmlTag (const True)
contents <- many $ notFollowedBy' stopper >>
(nonTagChunk <|> htmlInBalanced (const True) <|> anytag)
endtag <- liftM snd stopper
return $ tag ++ concat contents ++ endtag
htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String)
htmlTag f = try $ do
lookAhead (char '<')
(next : _) <- getInput >>= return . canonicalizeTags . parseTags
guard $ f next
case next of
TagComment s -> do
count (length s + 4) anyChar
skipMany (satisfy (/='>'))
char '>'
return (next, "<!--" ++ s ++ "-->")
_ -> do
rendered <- manyTill anyChar (char '>')
return (next, rendered ++ ">")