module Text.Pandoc.Readers.HTML ( readHtml
, htmlTag
, htmlInBalanced
, isInlineTag
, isBlockTag
, NamedTag(..)
, isTextTag
, isCommentTag
) where
import Control.Applicative ((<|>))
import Control.Arrow ((***))
import Control.Monad (guard, mplus, msum, mzero, unless, void)
import Control.Monad.Except (throwError)
import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT)
import Data.Char (isAlphaNum, isDigit, isLetter)
import Data.Default (Default (..), def)
import Data.Foldable (for_)
import Data.List (intercalate, isPrefixOf)
import Data.List.Split (wordsBy)
import qualified Data.Map as M
import Data.Maybe (fromMaybe, isJust, isNothing)
import Data.Monoid (First (..))
import Data.Monoid ((<>))
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as T
import Network.URI (URI, nonStrictRelativeTo, parseURIReference)
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines)
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Class (PandocMonad (..))
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import Text.Pandoc.Definition
import Text.Pandoc.Error
import Text.Pandoc.Logging
import Text.Pandoc.Options (Extension (Ext_epub_html_exts, Ext_native_divs, Ext_native_spans, Ext_raw_html),
ReaderOptions (readerExtensions, readerStripComments),
extensionEnabled)
import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (addMetaField, crFilter, escapeURI, extractSpaces,
safeRead, underlineSpan)
import Text.Pandoc.Walk
import Text.Parsec.Error
import Text.TeXMath (readMathML, writeTeX)
readHtml :: PandocMonad m
=> ReaderOptions
-> Text
-> m Pandoc
readHtml opts inp = do
let tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True }
(crFilter inp)
parseDoc = do
blocks <- (fixPlains False) . mconcat <$> manyTill block eof
meta <- stateMeta . parserState <$> getState
bs' <- replaceNotes (B.toList blocks)
reportLogMessages
return $ Pandoc meta bs'
getError (errorMessages -> ms) = case ms of
[] -> ""
(m:_) -> messageString m
result <- flip runReaderT def $
runParserT parseDoc
(HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty [])
"source" tags
case result of
Right doc -> return doc
Left err -> throwError $ PandocParseError $ getError err
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes = walkM replaceNotes'
replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline
replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
where
getNotes = noteTable <$> getState
replaceNotes' x = return x
data HTMLState =
HTMLState
{ parserState :: ParserState,
noteTable :: [(String, Blocks)],
baseHref :: Maybe URI,
identifiers :: Set.Set String,
headerMap :: M.Map Inlines String,
logMessages :: [LogMessage]
}
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
, inChapter :: Bool
, inPlain :: Bool
}
setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
setInChapter = local (\s -> s {inChapter = True})
setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
setInPlain = local (\s -> s {inPlain = True})
type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
type TagParser m = HTMLParser m [Tag Text]
pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAnyTag
for_ (lookup "lang" attr) $
updateState . B.setMeta "lang" . B.text . T.unpack
pInTags "html" block
pBody :: PandocMonad m => TagParser m Blocks
pBody = pInTags "body" block
pHead :: PandocMonad m => TagParser m Blocks
pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
where pTitle = pInTags "title" inline >>= setTitle . trimInlines
setTitle t = mempty <$ updateState (B.setMeta "title" t)
pMetaTag = do
mt <- pSatisfy (matchTagOpen "meta" [])
let name = T.unpack $ fromAttrib "name" mt
if null name
then return mempty
else do
let content = T.unpack $ fromAttrib "content" mt
updateState $ \s ->
let ps = parserState s in
s{ parserState = ps{
stateMeta = addMetaField name (B.text content)
(stateMeta ps) } }
return mempty
pBaseTag = do
bt <- pSatisfy (matchTagOpen "base" [])
updateState $ \st -> st{ baseHref =
parseURIReference $ T.unpack $ fromAttrib "href" bt }
return mempty
block :: PandocMonad m => TagParser m Blocks
block = do
res <- choice
[ eSection
, eSwitch B.para block
, mempty <$ eFootnote
, mempty <$ eTOC
, mempty <$ eTitlePage
, pPara
, pHeader
, pBlockQuote
, pCodeBlock
, pList
, pHrule
, pTable
, pHtml
, pHead
, pBody
, pDiv
, pPlain
, pFigure
, pRawHtmlBlock
]
trace (take 60 $ show $ B.toList res)
return res
namespaces :: PandocMonad m => [(String, TagParser m Inlines)]
namespaces = [(mathMLNamespace, pMath True)]
mathMLNamespace :: String
mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
eSwitch :: (PandocMonad m, Monoid a)
=> (Inlines -> a)
-> TagParser m a
-> TagParser m a
eSwitch constructor parser = try $ do
guardEnabled Ext_epub_html_exts
pSatisfy (matchTagOpen "switch" [])
cases <- getFirst . mconcat <$>
manyTill (First <$> (eCase <* skipMany pBlank) )
(lookAhead $ try $ pSatisfy (matchTagOpen "default" []))
skipMany pBlank
fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)
skipMany pBlank
pSatisfy (matchTagClose "switch")
return $ maybe fallback constructor cases
eCase :: PandocMonad m => TagParser m (Maybe Inlines)
eCase = do
skipMany pBlank
TagOpen _ attr' <- lookAhead $ pSatisfy $ (matchTagOpen "case" [])
let attr = toStringAttr attr'
case (flip lookup namespaces) =<< lookup "required-namespace" attr of
Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case"))
eFootnote :: PandocMonad m => TagParser m ()
eFootnote = try $ do
let notes = ["footnote", "rearnote"]
guardEnabled Ext_epub_html_exts
(TagOpen tag attr') <- lookAhead pAnyTag
let attr = toStringAttr attr'
guard (maybe False (flip elem notes) (lookup "type" attr))
let ident = fromMaybe "" (lookup "id" attr)
content <- pInTags tag block
addNote ident content
addNote :: PandocMonad m => String -> Blocks -> TagParser m ()
addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
eNoteref :: PandocMonad m => TagParser m Inlines
eNoteref = try $ do
guardEnabled Ext_epub_html_exts
TagOpen tag attr' <- lookAhead $ pAnyTag
let attr = toStringAttr attr'
guard (maybe False (== "noteref") (lookup "type" attr))
let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
guard (not (null ident))
pInTags tag block
return $ B.rawInline "noteref" ident
eTOC :: PandocMonad m => TagParser m ()
eTOC = try $ do
guardEnabled Ext_epub_html_exts
(TagOpen tag attr) <- lookAhead $ pAnyTag
guard (maybe False (== "toc") (lookup "type" attr))
void (pInTags tag block)
pList :: PandocMonad m => TagParser m Blocks
pList = pBulletList <|> pOrderedList <|> pDefinitionList
pBulletList :: PandocMonad m => TagParser m Blocks
pBulletList = try $ do
pSatisfy (matchTagOpen "ul" [])
let nonItem = pSatisfy (\t ->
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
not (matchTagClose "ul" t))
skipMany nonItem
items <- manyTill (pListItem nonItem) (pCloses "ul")
return $ B.bulletList $ map (fixPlains True) items
pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks
pListItem nonItem = do
TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" [])
let attr = toStringAttr attr'
let addId ident bs = case B.toList bs of
(Plain ils:xs) -> B.fromList (Plain
[Span (ident, [], []) ils] : xs)
_ -> B.divWith (ident, [], []) bs
(maybe id addId (lookup "id" attr)) <$>
pInTags "li" block <* skipMany nonItem
parseListStyleType :: String -> ListNumberStyle
parseListStyleType "lower-roman" = LowerRoman
parseListStyleType "upper-roman" = UpperRoman
parseListStyleType "lower-alpha" = LowerAlpha
parseListStyleType "upper-alpha" = UpperAlpha
parseListStyleType "decimal" = Decimal
parseListStyleType _ = DefaultStyle
parseTypeAttr :: String -> ListNumberStyle
parseTypeAttr "i" = LowerRoman
parseTypeAttr "I" = UpperRoman
parseTypeAttr "a" = LowerAlpha
parseTypeAttr "A" = UpperAlpha
parseTypeAttr "1" = Decimal
parseTypeAttr _ = DefaultStyle
pOrderedList :: PandocMonad m => TagParser m Blocks
pOrderedList = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" [])
let attribs = toStringAttr attribs'
let (start, style) = (sta', sty')
where sta = fromMaybe "1" $
lookup "start" attribs
sta' = if all isDigit sta
then read sta
else 1
pickListStyle = pickStyleAttrProps ["list-style-type", "list-style"]
typeAttr = fromMaybe "" $ lookup "type" attribs
classAttr = fromMaybe "" $ lookup "class" attribs
styleAttr = fromMaybe "" $ lookup "style" attribs
listStyle = fromMaybe "" $ pickListStyle styleAttr
sty' = foldOrElse DefaultStyle
[ parseTypeAttr typeAttr
, parseListStyleType classAttr
, parseListStyleType listStyle
]
let nonItem = pSatisfy (\t ->
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
not (matchTagClose "ol" t))
skipMany nonItem
items <- manyTill (pListItem nonItem) (pCloses "ol")
return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
pDefinitionList :: PandocMonad m => TagParser m Blocks
pDefinitionList = try $ do
pSatisfy (matchTagOpen "dl" [])
items <- manyTill pDefListItem (pCloses "dl")
return $ B.definitionList items
pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks])
pDefListItem = try $ do
let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) &&
not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t))
terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
skipMany nonItem
let term = foldl1 (\x y -> x <> B.linebreak <> y) terms
return (term, map (fixPlains True) defs)
fixPlains :: Bool -> Blocks -> Blocks
fixPlains inList bs = if any isParaish bs'
then B.fromList $ map plainToPara bs'
else bs
where isParaish (Para _) = True
isParaish (CodeBlock _ _) = True
isParaish (Header _ _ _) = True
isParaish (BlockQuote _) = True
isParaish (BulletList _) = not inList
isParaish (OrderedList _ _) = not inList
isParaish (DefinitionList _) = not inList
isParaish _ = False
plainToPara (Plain xs) = Para xs
plainToPara x = x
bs' = B.toList bs
pRawTag :: PandocMonad m => TagParser m Text
pRawTag = do
tag <- pAnyTag
let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
if tagOpen ignorable (const True) tag || tagClose ignorable tag
then return mempty
else return $ renderTags' [tag]
pDiv :: PandocMonad m => TagParser m Blocks
pDiv = try $ do
guardEnabled Ext_native_divs
let isDivLike "div" = True
isDivLike "section" = True
isDivLike "main" = True
isDivLike _ = False
TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
let attr = toStringAttr attr'
contents <- pInTags tag block
let (ident, classes, kvs) = mkAttr attr
let classes' = if tag == "section"
then "section":classes
else classes
kvs' = if tag == "main" && isNothing (lookup "role" kvs)
then ("role", "main"):kvs
else kvs
return $ B.divWith (ident, classes', kvs') contents
pRawHtmlBlock :: PandocMonad m => TagParser m Blocks
pRawHtmlBlock = do
raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag)
exts <- getOption readerExtensions
if extensionEnabled Ext_raw_html exts && not (null raw)
then return $ B.rawBlock "html" raw
else ignore raw
ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a
ignore raw = do
pos <- getPosition
unless (null raw) $
logMessage $ SkippedContent raw pos
return mempty
pHtmlBlock :: PandocMonad m => Text -> TagParser m Text
pHtmlBlock t = try $ do
open <- pSatisfy (matchTagOpen t [])
contents <- manyTill pAnyTag (pSatisfy (matchTagClose t))
return $ renderTags' $ [open] <> contents <> [TagClose t]
eSection :: PandocMonad m => TagParser m Blocks
eSection = try $ do
let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as)
let sectTag = tagOpen (`elem` sectioningContent) matchChapter
TagOpen tag _ <- lookAhead $ pSatisfy sectTag
setInChapter (pInTags tag block)
headerLevel :: PandocMonad m => Text -> TagParser m Int
headerLevel tagtype = do
case safeRead (T.unpack (T.drop 1 tagtype)) of
Just level ->
(try $ do
guardEnabled Ext_epub_html_exts
asks inChapter >>= guard
return (level 1))
<|>
return level
Nothing -> fail "Could not retrieve header level"
eTitlePage :: PandocMonad m => TagParser m ()
eTitlePage = try $ do
let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as)
let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section")
isTitlePage
TagOpen tag _ <- lookAhead $ pSatisfy groupTag
() <$ pInTags tag block
pHeader :: PandocMonad m => TagParser m Blocks
pHeader = try $ do
TagOpen tagtype attr' <- pSatisfy $
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
(const True)
let attr = toStringAttr attr'
let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text)
[("class","title")]
level <- headerLevel tagtype
contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
let ident = fromMaybe "" $ lookup "id" attr
let classes = maybe [] words $ lookup "class" attr
let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
attr'' <- registerHeader (ident, classes, keyvals) contents
return $ if bodyTitle
then mempty
else B.headerWith attr'' level contents
pHrule :: PandocMonad m => TagParser m Blocks
pHrule = do
pSelfClosing (=="hr") (const True)
return B.horizontalRule
pTable :: PandocMonad m => TagParser m Blocks
pTable = try $ do
TagOpen _ _ <- pSatisfy (matchTagOpen "table" [])
skipMany pBlank
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
let pTh = option [] $ pInTags "tr" (pCell "th")
pTr = try $ skipMany pBlank >>
pInTags "tr" (pCell "td" <|> pCell "th")
pTBody = pOptInTag "tbody" $ many1 pTr
head'' <- pOptInTag "thead" pTh
head' <- map snd <$>
(pOptInTag "tbody" $
if null head'' then pTh else return head'')
rowsLs <- many pTBody
rows' <- pOptInTag "tfoot" $ many pTr
TagClose _ <- pSatisfy (matchTagClose "table")
let rows'' = (concat rowsLs) <> rows'
let rows''' = map (map snd) rows''
guard $ not $ null head' && null rows'''
let isSinglePlain x = case B.toList x of
[] -> True
[Plain _] -> True
_ -> False
let isSimple = all isSinglePlain $ concat (head':rows''')
let cols = length $ if null head' then head rows''' else head'
let addEmpties r = case cols length r of
n | n > 0 -> r <> replicate n mempty
| otherwise -> r
let rows = map addEmpties rows'''
let aligns = case rows'' of
(cs:_) -> map fst cs
_ -> replicate cols AlignDefault
let widths = if null widths'
then if isSimple
then replicate cols 0
else replicate cols (1.0 / fromIntegral cols)
else widths'
return $ B.table caption (zip aligns widths) head' rows
pCol :: PandocMonad m => TagParser m Double
pCol = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
let attribs = toStringAttr attribs'
skipMany pBlank
optional $ pSatisfy (matchTagClose "col")
skipMany pBlank
return $ case lookup "width" attribs of
Nothing -> case lookup "style" attribs of
Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->
fromMaybe 0.0 $ safeRead ('0':'.':filter
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
_ -> 0.0
Just x | not (null x) && last x == '%' ->
fromMaybe 0.0 $ safeRead ('0':'.':init x)
_ -> 0.0
pColgroup :: PandocMonad m => TagParser m [Double]
pColgroup = try $ do
pSatisfy (matchTagOpen "colgroup" [])
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
noColOrRowSpans :: Tag Text -> Bool
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
where isNullOrOne x = case fromAttrib x t of
"" -> True
"1" -> True
_ -> False
pCell :: PandocMonad m => Text -> TagParser m [(Alignment, Blocks)]
pCell celltype = try $ do
skipMany pBlank
tag <- lookAhead $
pSatisfy (\t -> t ~== TagOpen celltype [] && noColOrRowSpans t)
let extractAlign' [] = ""
extractAlign' ("text-align":x:_) = x
extractAlign' (_:xs) = extractAlign' xs
let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':'])
let align = case maybeFromAttrib "align" tag `mplus`
(extractAlign <$> maybeFromAttrib "style" tag) of
Just "left" -> AlignLeft
Just "right" -> AlignRight
Just "center" -> AlignCenter
_ -> AlignDefault
res <- pInTags' celltype noColOrRowSpans block
skipMany pBlank
return [(align, res)]
pBlockQuote :: PandocMonad m => TagParser m Blocks
pBlockQuote = do
contents <- pInTags "blockquote" block
return $ B.blockQuote $ fixPlains False contents
pPlain :: PandocMonad m => TagParser m Blocks
pPlain = do
contents <- setInPlain $ trimInlines . mconcat <$> many1 inline
if B.isNull contents
then return mempty
else return $ B.plain contents
pPara :: PandocMonad m => TagParser m Blocks
pPara = do
contents <- trimInlines <$> pInTags "p" inline
return $ B.para contents
pFigure :: PandocMonad m => TagParser m Blocks
pFigure = try $ do
TagOpen _ _ <- pSatisfy (matchTagOpen "figure" [])
skipMany pBlank
let pImg = (\x -> (Just x, Nothing)) <$>
(pOptInTag "p" pImage <* skipMany pBlank)
pCapt = (\x -> (Nothing, Just x)) <$>
(pInTags "figcaption" inline <* skipMany pBlank)
pSkip = (Nothing, Nothing) <$ pSatisfy (not . matchTagClose "figure")
res <- many (pImg <|> pCapt <|> pSkip)
let mbimg = msum $ map fst res
let mbcap = msum $ map snd res
TagClose _ <- pSatisfy (matchTagClose "figure")
let caption = fromMaybe mempty mbcap
case B.toList <$> mbimg of
Just [Image attr _ (url, tit)] ->
return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption
_ -> mzero
pCodeBlock :: PandocMonad m => TagParser m Blocks
pCodeBlock = try $ do
TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" [])
let attr = toStringAttr attr'
contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
let rawText = concatMap tagToString contents
let result' = case rawText of
'\n':xs -> xs
_ -> rawText
let result = case reverse result' of
'\n':_ -> init result'
_ -> result'
return $ B.codeBlockWith (mkAttr attr) result
tagToString :: Tag Text -> String
tagToString (TagText s) = T.unpack s
tagToString (TagOpen "br" _) = "\n"
tagToString _ = ""
inline :: PandocMonad m => TagParser m Inlines
inline = choice
[ eNoteref
, eSwitch id inline
, pTagText
, pQ
, pEmph
, pStrong
, pSuperscript
, pSubscript
, pStrikeout
, pUnderline
, pLineBreak
, pLink
, pImage
, pCode
, pSpan
, pMath False
, pRawHtmlInline
]
pLocation :: PandocMonad m => TagParser m ()
pLocation = do
(TagPosition r c) <- pSat isTagPosition
setPosition $ newPos "input" r c
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSat f = do
pos <- getPosition
token show (const pos) (\x -> if f x then Just x else Nothing)
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy f = try $ optional pLocation >> pSat f
pAnyTag :: PandocMonad m => TagParser m (Tag Text)
pAnyTag = pSatisfy (const True)
pSelfClosing :: PandocMonad m
=> (Text -> Bool) -> ([Attribute Text] -> Bool)
-> TagParser m (Tag Text)
pSelfClosing f g = do
open <- pSatisfy (tagOpen f g)
optional $ pSatisfy (tagClose f)
return open
pQ :: PandocMonad m => TagParser m Inlines
pQ = do
context <- asks quoteContext
let quoteType = case context of
InDoubleQuote -> SingleQuote
_ -> DoubleQuote
let innerQuoteContext = if quoteType == SingleQuote
then InSingleQuote
else InDoubleQuote
let constructor = case quoteType of
SingleQuote -> B.singleQuoted
DoubleQuote -> B.doubleQuoted
withQuoteContext innerQuoteContext $
pInlinesInTags "q" constructor
pEmph :: PandocMonad m => TagParser m Inlines
pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph
pStrong :: PandocMonad m => TagParser m Inlines
pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong
pSuperscript :: PandocMonad m => TagParser m Inlines
pSuperscript = pInlinesInTags "sup" B.superscript
pSubscript :: PandocMonad m => TagParser m Inlines
pSubscript = pInlinesInTags "sub" B.subscript
pStrikeout :: PandocMonad m => TagParser m Inlines
pStrikeout = do
pInlinesInTags "s" B.strikeout <|>
pInlinesInTags "strike" B.strikeout <|>
pInlinesInTags "del" B.strikeout <|>
try (do pSatisfy (matchTagOpen "span" [("class","strikeout")])
contents <- mconcat <$> manyTill inline (pCloses "span")
return $ B.strikeout contents)
pUnderline :: PandocMonad m => TagParser m Inlines
pUnderline = pInlinesInTags "u" underlineSpan <|> pInlinesInTags "ins" underlineSpan
pLineBreak :: PandocMonad m => TagParser m Inlines
pLineBreak = do
pSelfClosing (=="br") (const True)
return B.linebreak
maybeFromAttrib :: String -> Tag Text -> Maybe String
maybeFromAttrib name (TagOpen _ attrs) =
T.unpack <$> lookup (T.pack name) attrs
maybeFromAttrib _ _ = Nothing
pLink :: PandocMonad m => TagParser m Inlines
pLink = try $ do
tag <- pSatisfy $ tagOpenLit "a" (const True)
let title = T.unpack $ fromAttrib "title" tag
let uid = maybe (T.unpack $ fromAttrib "name" tag) id $
maybeFromAttrib "id" tag
let cls = words $ T.unpack $ fromAttrib "class" tag
lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
case maybeFromAttrib "href" tag of
Nothing ->
return $ B.spanWith (uid, cls, []) lab
Just url' -> do
mbBaseHref <- baseHref <$> getState
let url = case (parseURIReference url', mbBaseHref) of
(Just rel, Just bs) ->
show (rel `nonStrictRelativeTo` bs)
_ -> url'
return $ B.linkWith (uid, cls, []) (escapeURI url) title lab
pImage :: PandocMonad m => TagParser m Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
mbBaseHref <- baseHref <$> getState
let url' = T.unpack $ fromAttrib "src" tag
let url = case (parseURIReference url', mbBaseHref) of
(Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
_ -> url'
let title = T.unpack $ fromAttrib "title" tag
let alt = T.unpack $ fromAttrib "alt" tag
let uid = T.unpack $ fromAttrib "id" tag
let cls = words $ T.unpack $ fromAttrib "class" tag
let getAtt k = case fromAttrib k tag of
"" -> []
v -> [(T.unpack k, T.unpack v)]
let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"]
return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)
pCode :: PandocMonad m => TagParser m Inlines
pCode = try $ do
(TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
let attr = toStringAttr attr'
result <- manyTill pAnyTag (pCloses open)
return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ T.unpack $
innerText result
pSpan :: PandocMonad m => TagParser m Inlines
pSpan = try $ do
guardEnabled Ext_native_spans
TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
let attr = toStringAttr attr'
contents <- pInTags "span" inline
let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes
where styleAttr = fromMaybe "" $ lookup "style" attr
fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr
classes = fromMaybe [] $
words <$> lookup "class" attr
let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr)
return $ tag contents
pRawHtmlInline :: PandocMonad m => TagParser m Inlines
pRawHtmlInline = do
inplain <- asks inPlain
result <- pSatisfy (tagComment (const True))
<|> if inplain
then pSatisfy (not . isBlockTag)
else pSatisfy isInlineTag
exts <- getOption readerExtensions
let raw = T.unpack $ renderTags' [result]
if extensionEnabled Ext_raw_html exts
then return $ B.rawInline "html" raw
else ignore raw
mathMLToTeXMath :: String -> Either String String
mathMLToTeXMath s = writeTeX <$> readMathML s
toStringAttr :: [(Text, Text)] -> [(String, String)]
toStringAttr = map go
where go (x,y) = (T.unpack x, T.unpack y)
pMath :: PandocMonad m => Bool -> TagParser m Inlines
pMath inCase = try $ do
open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True)
let attr = toStringAttr attr'
unless inCase $
guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr))
contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math"))
case mathMLToTeXMath (T.unpack $ renderTags $
[open] <> contents <> [TagClose "math"]) of
Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $
T.unpack $ innerText contents
Right [] -> return mempty
Right x -> return $ case lookup "display" attr of
Just "block" -> B.displayMath x
_ -> B.math x
pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines)
-> TagParser m Inlines
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
pInTags tagtype parser = pInTags' tagtype (const True) parser
pInTags' :: (PandocMonad m, Monoid a)
=> Text
-> (Tag Text -> Bool)
-> TagParser m a
-> TagParser m a
pInTags' tagtype tagtest parser = try $ do
pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
mconcat <$> manyTill parser (pCloses tagtype <|> eof)
pOptInTag :: PandocMonad m => Text -> TagParser m a -> TagParser m a
pOptInTag tagtype p = try $ do
skipMany pBlank
optional $ pSatisfy (matchTagOpen tagtype [])
skipMany pBlank
x <- p
skipMany pBlank
optional $ pSatisfy (matchTagClose tagtype)
skipMany pBlank
return x
pCloses :: PandocMonad m => Text -> TagParser m ()
pCloses tagtype = try $ do
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
case t of
(TagClose t') | t' == tagtype -> pAnyTag >> return ()
(TagOpen t' _) | t' `closes` tagtype -> return ()
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "dd" -> return ()
(TagClose "table") | tagtype == "td" -> return ()
(TagClose "table") | tagtype == "tr" -> return ()
(TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
(TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
(TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
-> return ()
_ -> mzero
pTagText :: PandocMonad m => TagParser m Inlines
pTagText = try $ do
(TagText str) <- pSatisfy isTagText
st <- getState
qu <- ask
parsed <- lift $ lift $
flip runReaderT qu $ runParserT (many pTagContents) st "text" str
case parsed of
Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'"
Right result -> return $ mconcat result
pBlank :: PandocMonad m => TagParser m ()
pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ T.all isSpace str
type InlinesParser m = HTMLParser m Text
pTagContents :: PandocMonad m => InlinesParser m Inlines
pTagContents =
B.displayMath <$> mathDisplay
<|> B.math <$> mathInline
<|> pStr
<|> pSpace
<|> smartPunctuation pTagContents
<|> pSymbol
<|> pBad
pStr :: PandocMonad m => InlinesParser m Inlines
pStr = do
result <- many1 $ satisfy $ \c ->
not (isSpace c) && not (isSpecial c) && not (isBad c)
updateLastStrPos
return $ B.str result
isSpecial :: Char -> Bool
isSpecial '"' = True
isSpecial '\'' = True
isSpecial '.' = True
isSpecial '-' = True
isSpecial '$' = True
isSpecial '\8216' = True
isSpecial '\8217' = True
isSpecial '\8220' = True
isSpecial '\8221' = True
isSpecial _ = False
pSymbol :: PandocMonad m => InlinesParser m Inlines
pSymbol = satisfy isSpecial >>= return . B.str . (:[])
isBad :: Char -> Bool
isBad c = c >= '\128' && c <= '\159'
pBad :: PandocMonad m => InlinesParser m Inlines
pBad = do
c <- satisfy isBad
let c' = case c of
'\128' -> '\8364'
'\130' -> '\8218'
'\131' -> '\402'
'\132' -> '\8222'
'\133' -> '\8230'
'\134' -> '\8224'
'\135' -> '\8225'
'\136' -> '\710'
'\137' -> '\8240'
'\138' -> '\352'
'\139' -> '\8249'
'\140' -> '\338'
'\142' -> '\381'
'\145' -> '\8216'
'\146' -> '\8217'
'\147' -> '\8220'
'\148' -> '\8221'
'\149' -> '\8226'
'\150' -> '\8211'
'\151' -> '\8212'
'\152' -> '\732'
'\153' -> '\8482'
'\154' -> '\353'
'\155' -> '\8250'
'\156' -> '\339'
'\158' -> '\382'
'\159' -> '\376'
_ -> '?'
return $ B.str [c']
pSpace :: PandocMonad m => InlinesParser m Inlines
pSpace = many1 (satisfy isSpace) >>= \xs ->
if '\n' `elem` xs
then return B.softbreak
else return B.space
eitherBlockOrInline :: Set.Set Text
eitherBlockOrInline = Set.fromList
["audio", "applet", "button", "iframe", "embed",
"del", "ins", "progress", "map", "area", "noscript", "script",
"object", "svg", "video", "source"]
blockHtmlTags :: Set.Set Text
blockHtmlTags = Set.fromList
["?xml", "!DOCTYPE", "address", "article", "aside",
"blockquote", "body", "canvas",
"caption", "center", "col", "colgroup", "dd", "details",
"dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
"footer", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "head", "header", "hgroup", "hr", "html",
"isindex", "main", "menu", "meta", "noframes", "ol", "output", "p", "pre",
"section", "table", "tbody", "textarea",
"thead", "tfoot", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script", "style"]
blockDocBookTags :: Set.Set Text
blockDocBookTags = Set.fromList
["calloutlist", "bibliolist", "glosslist", "itemizedlist",
"orderedlist", "segmentedlist", "simplelist",
"variablelist", "caution", "important", "note", "tip",
"warning", "address", "literallayout", "programlisting",
"programlistingco", "screen", "screenco", "screenshot",
"synopsis", "example", "informalexample", "figure",
"informalfigure", "table", "informaltable", "para",
"simpara", "formalpara", "equation", "informalequation",
"figure", "screenshot", "mediaobject", "qandaset",
"procedure", "task", "cmdsynopsis", "funcsynopsis",
"classsynopsis", "blockquote", "epigraph", "msgset",
"sidebar", "title"]
epubTags :: Set.Set Text
epubTags = Set.fromList ["case", "switch", "default"]
blockTags :: Set.Set Text
blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags]
class NamedTag a where
getTagName :: a -> Maybe Text
instance NamedTag (Tag Text) where
getTagName (TagOpen t _) = Just t
getTagName (TagClose t) = Just t
getTagName _ = Nothing
instance NamedTag (Tag String) where
getTagName (TagOpen t _) = Just (T.pack t)
getTagName (TagClose t) = Just (T.pack t)
getTagName _ = Nothing
isInlineTag :: NamedTag (Tag a) => Tag a -> Bool
isInlineTag t = isInlineTagName || isCommentTag t
where isInlineTagName = case getTagName t of
Just x -> x
`Set.notMember` blockTags
Nothing -> False
isBlockTag :: NamedTag (Tag a) => Tag a -> Bool
isBlockTag t = isBlockTagName || isTagComment t
where isBlockTagName =
case getTagName t of
Just x
| "?" `T.isPrefixOf` x -> True
| "!" `T.isPrefixOf` x -> True
| otherwise -> x `Set.member` blockTags
|| x `Set.member` eitherBlockOrInline
Nothing -> False
isTextTag :: Tag a -> Bool
isTextTag = tagText (const True)
isCommentTag :: Tag a -> Bool
isCommentTag = tagComment (const True)
closes :: Text -> Text -> Bool
_ `closes` "body" = False
_ `closes` "html" = False
"body" `closes` "head" = True
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dd" `closes` t | t `elem` ["dt", "dd"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
"optgroup" `closes` "optgroup" = True
"optgroup" `closes` "option" = True
"option" `closes` "option" = True
x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
"dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
"table", "ul"] = True
_ `closes` "meta" = True
"form" `closes` "form" = True
"label" `closes` "label" = True
"map" `closes` "map" = True
"object" `closes` "object" = True
_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
t `closes` "select" | t /= "option" = True
"thead" `closes` t | t `elem` ["colgroup"] = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
t `closes` t2 |
t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True
t1 `closes` t2 |
t1 `Set.member` blockTags &&
t2 `Set.notMember` blockTags &&
t2 `Set.notMember` eitherBlockOrInline = True
_ `closes` _ = False
htmlInBalanced :: (HasReaderOptions st, Monad m)
=> (Tag String -> Bool)
-> ParserT String st m String
htmlInBalanced f = try $ do
lookAhead (char '<')
inp <- getInput
let ts = canonicalizeTags $
parseTagsOptions parseOptions{ optTagWarning = True,
optTagPosition = True } inp
case ts of
(TagPosition sr sc : t@(TagOpen tn _) : rest) -> do
guard $ f t
guard $ not $ hasTagWarning (t : take 1 rest)
case htmlInBalanced' tn (t:rest) of
[] -> mzero
xs -> case reverse xs of
(TagClose _ : TagPosition er ec : _) -> do
let ls = er sr
let cs = ec sc
lscontents <- unlines <$> count ls anyLine
cscontents <- count cs anyChar
closetag <- do
x <- many (satisfy (/='>'))
char '>'
return (x <> ">")
return (lscontents <> cscontents <> closetag)
_ -> mzero
_ -> mzero
htmlInBalanced' :: String
-> [Tag String]
-> [Tag String]
htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
where go :: Int -> [Tag String] -> Maybe [Tag String]
go n (t@(TagOpen tn' _):rest) | tn' == tagname =
(t :) <$> go (n + 1) rest
go 1 (t@(TagClose tn'):_) | tn' == tagname =
return [t]
go n (t@(TagClose tn'):rest) | tn' == tagname =
(t :) <$> go (n 1) rest
go n (t:ts') = (t :) <$> go n ts'
go _ [] = mzero
hasTagWarning :: [Tag a] -> Bool
hasTagWarning (TagWarning _:_) = True
hasTagWarning _ = False
htmlTag :: (HasReaderOptions st, Monad m)
=> (Tag String -> Bool)
-> ParserT [Char] st m (Tag String, String)
htmlTag f = try $ do
lookAhead (char '<')
startpos <- getPosition
inp <- getInput
let ts = canonicalizeTags $ parseTagsOptions
parseOptions{ optTagWarning = False
, optTagPosition = True }
(inp ++ " ")
(next, ln, col) <- case ts of
(TagPosition{} : next : TagPosition ln col : _)
| f next -> return (next, ln, col)
_ -> mzero
let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_'
let isName s = case s of
[] -> False
(c:cs) -> isLetter c && all isNameChar cs
let endpos = if ln == 1
then setSourceColumn startpos
(sourceColumn startpos + (col 1))
else setSourceColumn (setSourceLine startpos
(sourceLine startpos + (ln 1)))
col
let endAngle = try $
do char '>'
pos <- getPosition
guard $ pos >= endpos
let handleTag tagname = do
guard $ isName tagname
guard $ not $ null tagname
guard $ last tagname /= ':'
char '<'
rendered <- manyTill anyChar endAngle
return (next, "<" ++ rendered ++ ">")
case next of
TagComment s
| "<!--" `isPrefixOf` inp -> do
string "<!--"
count (length s) anyChar
string "-->"
stripComments <- getOption readerStripComments
if stripComments
then return (next, "")
else return (next, "<!--" <> s <> "-->")
| otherwise -> fail "bogus comment mode, HTML5 parse error"
TagOpen tagname attr -> do
guard $ all (isName . fst) attr
handleTag tagname
TagClose tagname ->
handleTag tagname
_ -> mzero
mkAttr :: [(String, String)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) <> epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
stripPrefixes :: [Tag Text] -> [Tag Text]
stripPrefixes = map stripPrefix
stripPrefix :: Tag Text -> Tag Text
stripPrefix (TagOpen s as) =
TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
stripPrefix (TagClose s) = TagClose (stripPrefix' s)
stripPrefix x = x
stripPrefix' :: Text -> Text
stripPrefix' s =
if T.null t then s else T.drop 1 t
where (_, t) = T.span (/= ':') s
isSpace :: Char -> Bool
isSpace ' ' = True
isSpace '\t' = True
isSpace '\n' = True
isSpace '\r' = True
isSpace _ = False
instance HasIdentifierList HTMLState where
extractIdentifierList = identifiers
updateIdentifierList f s = s{ identifiers = f (identifiers s) }
instance HasHeaderMap HTMLState where
extractHeaderMap = headerMap
updateHeaderMap f s = s{ headerMap = f (headerMap s) }
instance HasLogMessages HTMLState where
addLogMessage m s = s{ logMessages = m : logMessages s }
getLogMessages = reverse . logMessages
instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
getQuoteContext = asks quoteContext
withQuoteContext q = local (\s -> s{quoteContext = q})
instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState
instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st}
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
instance Default HTMLLocal where
def = HTMLLocal NoQuote False False
instance HasLastStrPosition HTMLState where
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
getLastStrPos = getLastStrPos . parserState
renderTags' :: [Tag Text] -> Text
renderTags' = renderTagsOptions
renderOptions{ optMinimize = matchTags ["hr", "br", "img",
"meta", "link"]
, optRawTag = matchTags ["script", "style"] }
where matchTags tags = flip elem tags . T.toLower
sectioningContent :: [Text]
sectioningContent = ["article", "aside", "nav", "section"]
groupingContent :: [Text]
groupingContent = ["p", "hr", "pre", "blockquote", "ol"
, "ul", "li", "dl", "dt", "dt", "dd"
, "figure", "figcaption", "div", "main"]
matchTagClose :: Text -> (Tag Text -> Bool)
matchTagClose t = (~== TagClose t)
matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
matchTagOpen t as = (~== TagOpen t as)