{-# LANGUAGE OverloadedStrings #-}
module Cheapskate.Parse (
markdown
) where
import Cheapskate.ParserCombinators
import Cheapskate.Util
import Cheapskate.Inlines
import Cheapskate.Types
import Data.Char hiding (Space)
import qualified Data.Set as Set
import Prelude hiding (takeWhile)
import Data.Maybe (mapMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Monoid
import Data.Foldable (toList)
import Data.Sequence ((|>), viewr, ViewR(..), singleton, Seq)
import qualified Data.Sequence as Seq
import Control.Monad.RWS
import Control.Applicative
import qualified Data.Map as M
import Data.List (intercalate)
import Debug.Trace
markdown :: Options -> Text -> Doc
markdown opts
| debug opts = (\x -> trace (show x) $ Doc opts mempty) . processLines
| otherwise = Doc opts . processDocument . processLines
data ContainerStack =
ContainerStack Container [Container]
type LineNumber = Int
data Elt = C Container
| L LineNumber Leaf
deriving Show
data Container = Container{
containerType :: ContainerType
, children :: Seq Elt
}
data ContainerType = Document
| BlockQuote
| ListItem { markerColumn :: Int
, padding :: Int
, listType :: ListType }
| FencedCode { startColumn :: Int
, fence :: Text
, info :: Text }
| IndentedCode
| RawHtmlBlock
| Reference
deriving (Eq, Show)
instance Show Container where
show c = show (containerType c) ++ "\n" ++
nest 2 (intercalate "\n" (map showElt $ toList $ children c))
nest :: Int -> String -> String
nest num = intercalate "\n" . map ((replicate num ' ') ++) . lines
showElt :: Elt -> String
showElt (C c) = show c
showElt (L _ (TextLine s)) = show s
showElt (L _ lf) = show lf
containerContinue :: Container -> Scanner
containerContinue c =
case containerType c of
BlockQuote -> scanNonindentSpace *> scanBlockquoteStart
IndentedCode -> scanIndentSpace
FencedCode{startColumn = col} ->
scanSpacesToColumn col
RawHtmlBlock -> nfb scanBlankline
li@ListItem{} -> scanBlankline
<|>
(do scanSpacesToColumn
(markerColumn li + 1)
upToCountChars (padding li - 1)
(==' ')
return ())
Reference{} -> nfb scanBlankline >>
nfb (scanNonindentSpace *> scanReference)
_ -> return ()
{-# INLINE containerContinue #-}
containerStart :: Bool -> Parser ContainerType
containerStart _lastLineIsText = scanNonindentSpace *>
( (BlockQuote <$ scanBlockquoteStart)
<|> parseListMarker
)
verbatimContainerStart :: Bool -> Parser ContainerType
verbatimContainerStart lastLineIsText = scanNonindentSpace *>
( parseCodeFence
<|> (guard (not lastLineIsText) *> (IndentedCode <$ char ' ' <* nfb scanBlankline))
<|> (guard (not lastLineIsText) *> (RawHtmlBlock <$ parseHtmlBlockStart))
<|> (guard (not lastLineIsText) *> (Reference <$ scanReference))
)
data Leaf = TextLine Text
| BlankLine Text
| ATXHeader Int Text
| SetextHeader Int Text
| Rule
deriving (Show)
type ContainerM = RWS () ReferenceMap ContainerStack
closeStack :: ContainerM Container
closeStack = do
ContainerStack top rest <- get
if null rest
then return top
else closeContainer >> closeStack
closeContainer :: ContainerM ()
closeContainer = do
ContainerStack top rest <- get
case top of
(Container Reference{} cs'') ->
case parse pReference
(T.strip $ joinLines $ map extractText $ toList cs'') of
Right (lab, lnk, tit) -> do
tell (M.singleton (normalizeReference lab) (lnk, tit))
case rest of
(Container ct' cs' : rs) ->
put $ ContainerStack (Container ct' (cs' |> C top)) rs
[] -> return ()
Left _ ->
case rest of
(c:cs) -> put $ ContainerStack c cs
[] -> return ()
(Container li@ListItem{} cs'') ->
case rest of
(Container ct' cs' : rs) ->
case viewr cs'' of
(zs :> b@(L _ BlankLine{})) ->
put $ ContainerStack
(if Seq.null zs
then Container ct' (cs' |> C (Container li zs))
else Container ct' (cs' |>
C (Container li zs) |> b)) rs
_ -> put $ ContainerStack (Container ct' (cs' |> C top)) rs
[] -> return ()
_ -> case rest of
(Container ct' cs' : rs) ->
put $ ContainerStack (Container ct' (cs' |> C top)) rs
[] -> return ()
addLeaf :: LineNumber -> Leaf -> ContainerM ()
addLeaf lineNum lf = do
ContainerStack top rest <- get
case (top, lf) of
(Container ct@(ListItem{}) cs, BlankLine{}) ->
case viewr cs of
(_ :> L _ BlankLine{}) ->
closeContainer >> addLeaf lineNum lf
_ -> put $ ContainerStack (Container ct (cs |> L lineNum lf)) rest
(Container ct cs, _) ->
put $ ContainerStack (Container ct (cs |> L lineNum lf)) rest
addContainer :: ContainerType -> ContainerM ()
addContainer ct = modify $ \(ContainerStack top rest) ->
ContainerStack (Container ct mempty) (top:rest)
processDocument :: (Container, ReferenceMap) -> Blocks
processDocument (Container ct cs, refmap) =
case ct of
Document -> processElts refmap (toList cs)
_ -> error "top level container is not Document"
processElts :: ReferenceMap -> [Elt] -> Blocks
processElts _ [] = mempty
processElts refmap (L _lineNumber lf : rest) =
case lf of
TextLine t -> singleton (Para $ parseInlines refmap txt) <>
processElts refmap rest'
where txt = T.stripEnd $ joinLines $ map T.stripStart
$ t : map extractText textlines
(textlines, rest') = span isTextLine rest
isTextLine (L _ (TextLine _)) = True
isTextLine _ = False
BlankLine{} -> processElts refmap rest
ATXHeader lvl t -> singleton (Header lvl $ parseInlines refmap t) <>
processElts refmap rest
SetextHeader lvl t -> singleton (Header lvl $ parseInlines refmap t) <>
processElts refmap rest
Rule -> singleton HRule <> processElts refmap rest
processElts refmap (C (Container ct cs) : rest) =
case ct of
Document -> error "Document container found inside Document"
BlockQuote -> singleton (Blockquote $ processElts refmap (toList cs)) <>
processElts refmap rest
ListItem { listType = listType' } ->
singleton (List isTight listType' items') <> processElts refmap rest'
where xs = takeListItems rest
rest' = drop (length xs) rest
takeListItems
(C c@(Container ListItem { listType = lt' } _) : zs)
| listTypesMatch lt' listType' = C c : takeListItems zs
takeListItems (lf@(L _ (BlankLine _)) :
c@(C (Container ListItem { listType = lt' } _)) : zs)
| listTypesMatch lt' listType' = lf : c : takeListItems zs
takeListItems _ = []
listTypesMatch (Bullet c1) (Bullet c2) = c1 == c2
listTypesMatch (Numbered w1 _) (Numbered w2 _) = w1 == w2
listTypesMatch _ _ = False
items = mapMaybe getItem (Container ct cs : [c | C c <- xs])
getItem (Container ListItem{} cs') = Just $ toList cs'
getItem _ = Nothing
items' = map (processElts refmap) items
isTight = tightListItem xs && all tightListItem items
FencedCode _ _ info' -> singleton (CodeBlock attr txt) <>
processElts refmap rest
where txt = joinLines $ map extractText $ toList cs
attr = CodeAttr x (T.strip y)
(x,y) = T.break (==' ') info'
IndentedCode -> singleton (CodeBlock (CodeAttr "" "") txt)
<> processElts refmap rest'
where txt = joinLines $ stripTrailingEmpties
$ concatMap extractCode cbs
stripTrailingEmpties = reverse .
dropWhile (T.all (==' ')) . reverse
extractCode (L _ (BlankLine t)) = [T.drop 1 t]
extractCode (C (Container IndentedCode cs')) =
map extractText $ toList cs'
extractCode _ = []
(cbs, rest') = span isIndentedCodeOrBlank
(C (Container ct cs) : rest)
isIndentedCodeOrBlank (L _ BlankLine{}) = True
isIndentedCodeOrBlank (C (Container IndentedCode _))
= True
isIndentedCodeOrBlank _ = False
RawHtmlBlock -> singleton (HtmlBlock txt) <> processElts refmap rest
where txt = joinLines (map extractText (toList cs))
Reference{} -> processElts refmap rest
where isBlankLine (L _ BlankLine{}) = True
isBlankLine _ = False
tightListItem [] = True
tightListItem xs = not $ any isBlankLine xs
extractText :: Elt -> Text
extractText (L _ (TextLine t)) = t
extractText _ = mempty
processLines :: Text -> (Container, ReferenceMap)
processLines t = (doc, refmap)
where
(doc, refmap) = evalRWS (mapM_ processLine lns >> closeStack) () startState
lns = zip [1..] (map tabFilter $ T.lines t)
startState = ContainerStack (Container Document mempty) []
processLine :: (LineNumber, Text) -> ContainerM ()
processLine (lineNumber, txt) = do
ContainerStack top@(Container ct cs) rest <- get
let (t', numUnmatched) = tryOpenContainers (reverse $ top:rest) txt
let lastLineIsText = numUnmatched == 0 &&
case viewr cs of
(_ :> L _ (TextLine _)) -> True
_ -> False
case ct of
RawHtmlBlock{} | numUnmatched == 0 -> addLeaf lineNumber (TextLine t')
IndentedCode | numUnmatched == 0 -> addLeaf lineNumber (TextLine t')
FencedCode{ fence = fence' } ->
if fence' `T.isPrefixOf` t'
then closeContainer
else addLeaf lineNumber (TextLine t')
_ -> case tryNewContainers lastLineIsText (T.length txt - T.length t') t' of
([], TextLine t)
| numUnmatched > 0
, case viewr cs of
(_ :> L _ (TextLine _)) -> True
_ -> False
, ct /= IndentedCode -> addLeaf lineNumber (TextLine t)
([], SetextHeader lev _) | numUnmatched == 0 ->
case viewr cs of
(cs' :> L _ (TextLine t)) ->
put $ ContainerStack (Container ct
(cs' |> L lineNumber (SetextHeader lev t))) rest
_ -> error "setext header line without preceding text line"
(ns, lf) -> do
replicateM numUnmatched closeContainer
mapM_ addContainer ns
case (reverse ns, lf) of
(FencedCode{}:_, BlankLine{}) -> return ()
_ -> addLeaf lineNumber lf
tryOpenContainers :: [Container] -> Text -> (Text, Int)
tryOpenContainers cs t = case parse (scanners $ map containerContinue cs) t of
Right (t', n) -> (t', n)
Left e -> error $ "error parsing scanners: " ++
show e
where scanners [] = (,) <$> takeText <*> pure 0
scanners (p:ps) = (p *> scanners ps)
<|> ((,) <$> takeText <*> pure (length (p:ps)))
tryNewContainers :: Bool -> Int -> Text -> ([ContainerType], Leaf)
tryNewContainers lastLineIsText offset t =
case parse newContainers t of
Right (cs,t') -> (cs, t')
Left err -> error (show err)
where newContainers = do
getPosition >>= \pos -> setPosition pos{ column = offset + 1 }
regContainers <- many (containerStart lastLineIsText)
verbatimContainers <- option []
$ count 1 (verbatimContainerStart lastLineIsText)
if null verbatimContainers
then (,) <$> pure regContainers <*> leaf lastLineIsText
else (,) <$> pure (regContainers ++ verbatimContainers) <*>
textLineOrBlank
textLineOrBlank :: Parser Leaf
textLineOrBlank = consolidate <$> takeText
where consolidate ts | T.all (==' ') ts = BlankLine ts
| otherwise = TextLine ts
leaf :: Bool -> Parser Leaf
leaf lastLineIsText = scanNonindentSpace *> (
(ATXHeader <$> parseAtxHeaderStart <*>
(T.strip . removeATXSuffix <$> takeText))
<|> (guard lastLineIsText *> (SetextHeader <$> parseSetextHeaderLine <*> pure mempty))
<|> (Rule <$ scanHRuleLine)
<|> textLineOrBlank
)
where removeATXSuffix t = case T.dropWhileEnd (`elem` (" #" :: String)) t of
t' | T.null t' -> t'
| T.last t' == '\\' -> t' <> "#"
| otherwise -> t'
scanReference :: Scanner
scanReference = () <$ lookAhead (pLinkLabel >> scanChar ':')
scanBlockquoteStart :: Scanner
scanBlockquoteStart = scanChar '>' >> option () (scanChar ' ')
parseAtxHeaderStart :: Parser Int
parseAtxHeaderStart = do
char '#'
hashes <- upToCountChars 5 (== '#')
notFollowedBy (skip (/= ' '))
return $ T.length hashes + 1
parseSetextHeaderLine :: Parser Int
parseSetextHeaderLine = do
d <- satisfy (\c -> c == '-' || c == '=')
let lev = if d == '=' then 1 else 2
skipWhile (== d)
scanBlankline
return lev
scanHRuleLine :: Scanner
scanHRuleLine = do
c <- satisfy (\c -> c == '*' || c == '_' || c == '-')
count 2 $ scanSpaces >> skip (== c)
skipWhile (\x -> x == ' ' || x == c)
endOfInput
parseCodeFence :: Parser ContainerType
parseCodeFence = do
col <- column <$> getPosition
cs <- takeWhile1 (=='`') <|> takeWhile1 (=='~')
guard $ T.length cs >= 3
scanSpaces
rawattr <- takeWhile (\c -> c /= '`' && c /= '~')
endOfInput
return $ FencedCode { startColumn = col
, fence = cs
, info = rawattr }
parseHtmlBlockStart :: Parser ()
parseHtmlBlockStart = () <$ lookAhead
((do t <- pHtmlTag
guard $ f $ fst t
return $ snd t)
<|> string "<!--"
<|> string "-->"
)
where f (Opening name) = name `Set.member` blockHtmlTags
f (SelfClosing name) = name `Set.member` blockHtmlTags
f (Closing name) = name `Set.member` blockHtmlTags
blockHtmlTags :: Set.Set Text
blockHtmlTags = Set.fromList
[ "article", "header", "aside", "hgroup", "blockquote", "hr",
"body", "li", "br", "map", "button", "object", "canvas", "ol",
"caption", "output", "col", "p", "colgroup", "pre", "dd",
"progress", "div", "section", "dl", "table", "dt", "tbody",
"embed", "textarea", "fieldset", "tfoot", "figcaption", "th",
"figure", "thead", "footer", "footer", "tr", "form", "ul",
"h1", "h2", "h3", "h4", "h5", "h6", "video"]
parseListMarker :: Parser ContainerType
parseListMarker = do
col <- column <$> getPosition
ty <- parseBullet <|> parseListNumber
padding' <- (1 <$ scanBlankline)
<|> (1 <$ (skip (==' ') *> lookAhead (count 4 (char ' '))))
<|> (T.length <$> takeWhile (==' '))
guard $ padding' > 0
return $ ListItem { listType = ty
, markerColumn = col
, padding = padding' + listMarkerWidth ty
}
listMarkerWidth :: ListType -> Int
listMarkerWidth (Bullet _) = 1
listMarkerWidth (Numbered _ n) | n < 10 = 2
| n < 100 = 3
| n < 1000 = 4
| otherwise = 5
parseBullet :: Parser ListType
parseBullet = do
c <- satisfy (\c -> c == '+' || c == '*' || c == '-')
unless (c == '+')
$ nfb $ (count 2 $ scanSpaces >> skip (== c)) >>
skipWhile (\x -> x == ' ' || x == c) >> endOfInput
return $ Bullet c
parseListNumber :: Parser ListType
parseListNumber = do
num <- (read . T.unpack) <$> takeWhile1 isDigit
wrap <- PeriodFollowing <$ skip (== '.')
<|> ParenFollowing <$ skip (== ')')
return $ Numbered wrap num