{-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE ViewPatterns #-} module Text.Commonmark.Parser ( commonmarkToDoc ) where import Prelude hiding (takeWhile) import Control.Applicative import Control.Bool import Control.Monad import Control.Monad.Trans.RWS.Strict import Data.Char import Data.Either import Data.Foldable import Data.List (intercalate) import qualified Data.Map as M import Data.Maybe (mapMaybe) import Data.Monoid import Data.Sequence (Seq, ViewL(..), ViewR(..), singleton, viewl, viewr, (<|), (><), (|>)) import qualified Data.Sequence as Seq import qualified Data.Set as Set import Data.Text.Extended (Text) import qualified Data.Text.Extended as T import Text.Commonmark.Parser.Inline import Text.Commonmark.Parser.Options import Text.Commonmark.Parser.Util import Text.Commonmark.ParserCombinators import Text.Commonmark.Syntax import Text.Commonmark.Types -- | Parses Commonmark document. Any sequence of characters is a valid -- Commonmark document. -- -- At the moment no sanitizations are performed besides the ones defined -- in the spec. commonmarkToDoc :: [ParserOption] -> Text -> Doc Text commonmarkToDoc (parserOptions -> opts) text = Doc $ processDocument (cont, opts') where (cont, refmap) = processLines text opts' = opts { poLinkReferences = liftA2 (<|>) (poLinkReferences opts) (lookupLinkReference refmap) } -- General parsing strategy: -- -- Step 1: processLines -- -- We process the input line by line. Each line modifies the -- container stack, by adding a leaf to the current open container, -- sometimes after closing old containers and/or opening new ones. -- -- To open a container is to add it to the top of the container stack, -- so that new content will be added under this container. -- To close a container is to remove it from the container stack and -- make it a child of the container above it on the container stack. -- -- When all the input has been processed, we close all open containers -- except the root (Document) container. At this point we should also -- have a ReferenceMap containing any defined link references. -- -- Step 2: processDocument -- -- We then convert this container structure into an AST. This principally -- involves (a) gathering consecutive ListItem containers into lists, (b) -- gathering TextLine nodes that don't belong to verbatim containers into -- paragraphs, and (c) parsing the inline contents of non-verbatim TextLines. -------- -- Container stack definitions: data ContainerStack = ContainerStack Container {- top -} [Container] {- rest -} type LineNumber = Int data Elt = C Container | L LineNumber Leaf deriving (Show) data Container = Container { containerType :: ContainerType , children :: Seq Elt } data ContainerType = Document | BlockQuote | ListItem { padding :: Int , itemType :: ListType } | FencedCode { startColumn :: Int , fence :: Text , info :: Maybe Text } | IndentedCode | RawHtmlBlock Condition | Reference deriving (Show, Eq) isIndentedCode :: Elt -> Bool isIndentedCode (C (Container IndentedCode _)) = True isIndentedCode _ = False isBlankLine :: Elt -> Bool isBlankLine (L _ BlankLine{}) = True isBlankLine _ = False isTextLine :: Elt -> Bool isTextLine (L _ (TextLine _)) = True isTextLine _ = False isListItem :: ContainerType -> Bool isListItem ListItem{} = True isListItem _ = False instance Show Container where show c = show (containerType c) ++ "\n" ++ nest 2 (intercalate "\n" (map pptElt $ toList $ children c)) nest :: Int -> String -> String nest num = intercalate "\n" . map (replicate num ' ' ++) . lines pptElt :: Elt -> String pptElt (C c) = show c pptElt (L _ (TextLine s)) = show s pptElt (L _ lf) = show lf -- Scanners that must be satisfied if the current open container is to be -- continued on a new line (ignoring lazy continuations). containerContinue :: Container -> Scanner containerContinue c = case containerType c of BlockQuote -> scanNonindentSpace *> scanBlockquoteStart IndentedCode -> scanIndentSpace FencedCode{..} -> scanSpacesUpToColumn startColumn ListItem{..} -> scanBlankline <|> (tabCrusher *> replicateM_ padding (char ' ')) -- TODO: This is likely to be incorrect behaviour. Check. Reference -> nfb (scanBlankline <|> (scanNonindentSpace *> (scanReference <|> scanBlockquoteStart <|> scanTBreakLine)) <|> (() <$ parseAtxHeadingStart)) _ -> pure () {-# INLINE containerContinue #-} -- Defines parsers that open new containers. containerStart :: Bool -> Bool -> Parser ContainerType containerStart afterListItem lastLineIsText = choice [ scanNonindentSpace *> pure BlockQuote <* scanBlockquoteStart , parseListMarker afterListItem lastLineIsText ] -- Defines parsers that open new verbatim containers (containers -- that take only TextLine and BlankLine as children). verbatimContainerStart :: Bool -> Parser ContainerType verbatimContainerStart lastLineIsText = choice [ scanNonindentSpace *> parseCodeFence , do guard (not lastLineIsText) scanIndentSpace nfb scanBlankline pure IndentedCode , RawHtmlBlock <$> pHtmlBlockStart lastLineIsText , guard (not lastLineIsText) *> scanNonindentSpace *> (Reference <$ scanReference) ] -- Leaves of the container structure (they don't take children). type Leaf = GenLeaf Text data GenLeaf t = TextLine t | BlankLine t | ATXHeading HeadingLevel t | SetextHeading HeadingLevel t | SetextToken HeadingLevel t | Rule deriving (Show, Functor) type ContainerM = RWS () ReferenceMap ContainerStack -- Close the whole container stack, leaving only the root Document container. closeStack :: ContainerM Container closeStack = get >>= \case ContainerStack top [] -> return top ContainerStack _ _ -> closeContainer >> closeStack -- Close the top container on the stack. If the container is a Reference -- container, attempt to parse the reference and update the reference map. -- If it is a list item container, move a final BlankLine outside the list -- item. closeContainer :: ContainerM () closeContainer = do ContainerStack top rest <- get case top of (Container Reference cs'') -> case parse ((,) <$> pReference <*> takeText) (T.strip $ T.joinLines $ map extractText $ toList textlines) of Right ((lab, lnk, tit), unconsumed) -> do tell (M.singleton (normalizeReference lab) (lnk, tit)) case rest of (Container ct' cs' : rs) | T.null unconsumed -> put $ ContainerStack (Container ct' (rest' <> cs' |> C top)) rs | otherwise -> put $ ContainerStack (Container ct' ((L (-1) (TextLine unconsumed) <| rest') >< (cs' |> C top))) rs [] -> return () Left _ -> case rest of (Container ct' cs' : rs) -> put $ ContainerStack (Container ct' (cs' <> cs'')) rs [] -> return () where (textlines, rest') = Seq.spanl isTextLine cs'' (Container li@ListItem{} cs'') -> case rest of -- move final BlankLine outside of list item (Container ct' cs' : rs) -> case viewr cs'' of (zs :> b) | isBlankLine b -> put $ ContainerStack (Container ct' els) rs where els | null zs = cs' |> C (Container li zs) | otherwise = cs' |> C (Container li zs) |> b _ -> put $ ContainerStack (Container ct' (cs' |> C top)) rs [] -> return () _ -> case rest of (Container ct' cs' : rs) -> put $ ContainerStack (Container ct' (cs' |> C top)) rs [] -> return () -- Add a leaf to the top container. addLeaf :: LineNumber -> Leaf -> ContainerM () addLeaf lineNum lf = do ContainerStack top rest <- get case (top, lf) of (Container ct@(ListItem{}) cs, BlankLine{}) | (firstLine :< _) <- viewl cs -- two blanks break out of list item: , isBlankLine firstLine -> closeContainer *> addLeaf lineNum lf | otherwise -> put $ ContainerStack (Container ct (cs |> L lineNum lf)) rest (Container ct cs, _) -> put $ ContainerStack (Container ct (cs |> L lineNum lf)) rest -- Add a container to the container stack. addContainer :: ContainerType -> ContainerM () addContainer ct = modify $ \(ContainerStack top rest) -> ContainerStack (Container ct mempty) (top:rest) -- Step 2 -- Convert Document container and reference map into an AST. processDocument :: (Container, ParserOptions) -> Blocks Text processDocument (Container Document cs, opts) = processElts opts (toList cs) processDocument _ = error "top level container is not Document" -- Turn the result of `processLines` into a proper AST. -- This requires grouping text lines into paragraphs -- and list items into lists, handling blank lines, -- parsing inline contents of texts and resolving referencess. processElts :: ParserOptions -> [Elt] -> Blocks Text processElts _ [] = mempty processElts opts (L _lineNumber lf : rest) = case lf of -- Gobble text lines and make them into a Para: TextLine t -> singleton (Para $ parseInlines opts txt) <> processElts opts rest' where txt = T.stripEnd $ T.joinLines $ map T.stripStart $ t : map extractText textlines (textlines, rest') = span isTextLine rest -- Blanks at outer level are ignored: BlankLine{} -> processElts opts rest -- Headings: ATXHeading lvl t -> Heading lvl (parseInlines opts t) <| processElts opts rest SetextHeading lvl t -> Heading lvl (parseInlines opts t) <| processElts opts rest SetextToken _ _ -> error "Setext token wasn't converted to setext header" -- Horizontal rule: Rule -> ThematicBreak <| processElts opts rest processElts opts (C (Container ct cs) : rest) = case ct of Document -> error "Document container found inside Document" BlockQuote -> Quote (processElts opts (toList cs)) <| processElts opts rest -- List item? Gobble up following list items of the same type -- (skipping blank lines), determine whether the list is tight or -- loose, and generate a List. ListItem { itemType = itemType } -> List itemType isTight (Seq.fromList items') <| processElts opts rest' where xs = takeListItems rest rest' = drop (length xs) rest -- take list items as long as list type matches and we -- don't hit two blank lines: takeListItems (c@(C (Container ListItem { itemType = lt } _)) : zs) | listTypesMatch lt itemType = c : takeListItems zs takeListItems (lf@(L _ (BlankLine _)) : c@(C (Container ListItem { itemType = lt } _)) : zs) | listTypesMatch lt itemType = lf : c : takeListItems zs takeListItems _ = [] listTypesMatch (Bullet c1) (Bullet c2) = c1 == c2 listTypesMatch (Ordered w1 _) (Ordered w2 _) = w1 == w2 listTypesMatch _ _ = False items = mapMaybe getItem (Container ct cs : [c | C c <- xs]) getItem (Container ListItem{} cs') = Just $ toList cs' getItem _ = Nothing items' = map (processElts opts) items isTight = not (any isBlankLine xs) && all tightListItem items tightListItem [] = True tightListItem [_] = True tightListItem (_:is) = not $ any isBlankLine $ init is FencedCode _ _ info -> CodeBlock (parseInfoString <$> info) (T.unlines $ map extractText $ toList cs) <| processElts opts rest IndentedCode -> CodeBlock Nothing txt <| processElts opts rest' where txt = T.unlines $ stripTrailingEmpties $ concatMap extractCode cbs stripTrailingEmpties = reverse . dropWhile (T.all (== ' ')) . reverse -- explanation for next line: when we parsed -- the blank line, we dropped 0-3 spaces. -- but for this, code block context, we want -- to have dropped 4 spaces. we simply drop -- one more: extractCode (L _ (BlankLine t)) = [T.drop 1 t] extractCode (C (Container IndentedCode cs')) = map extractText $ toList cs' extractCode _ = [] (cbs, rest') = span (isIndentedCode <||> isBlankLine) (C (Container ct cs) : rest) RawHtmlBlock _ -> HtmlBlock txt <| processElts opts rest where txt = T.unlines (map extractText (toList cs)) -- References have already been taken into account in the reference map, -- so we just skip. Reference -> processElts opts rest extractText :: Elt -> Text extractText (L _ (TextLine t)) = t extractText _ = mempty -- Step 1 processLines :: Text -> (Container, ReferenceMap) processLines t = evalRWS (mapM_ processLine lns >> closeStack) () initState where lns = zip [1..] $ T.lines' $ T.replace "\0" "\xFFFD" t initState = ContainerStack (Container Document mempty) [] -- The main block-parsing function. -- We analyze a line of text and modify the container stack accordingly, -- adding a new leaf, or closing or opening containers. processLine :: (LineNumber, Text) -> ContainerM () processLine (lineNumber, txt) = do ContainerStack top@(Container ct cs) rest <- get -- Apply the line-start scanners appropriate for each nested container. -- Return the remainder of the string, and the number of unmatched -- containers. let (t', numUnmatched) = tryOpenContainers (reverse $ top:rest) txt -- Some new containers can be started only after a blank. let lastLineIsText = numUnmatched == 0 && case viewr cs of (_ :> L _ (TextLine _)) -> True _ -> False -- Process the rest of the line in a way that makes sense given -- the container type at the top of the stack (ct): case ct of -- If it's a verbatim line container, add the line. RawHtmlBlock c | numUnmatched == 0 -> do addLeaf lineNumber (TextLine t') when (isRight $ parse (blockEnd c) t') $ closeContainer IndentedCode | numUnmatched == 0 -> addLeaf lineNumber (TextLine t') FencedCode{ fence = fence' } | numUnmatched == 0 -> if isRight $ parse scanClosing t' -- closing code fence then closeContainer else addLeaf lineNumber (TextLine t') where scanClosing = upToCountChars 3 (== ' ') >> string fence' >> scanChars (T.head fence') >> scanSpaces >> endOfInput -- otherwise, parse the remainder to see if we have new container starts: _ -> case tryNewContainers (isListItem ct) lastLineIsText (T.length txt - T.length t') t' of -- lazy continuation: text line, last line was text, no new containers, -- some unmatched containers: ([], TextLine t) | numUnmatched > 0 , _ :> L _ TextLine{} <- viewr cs , ct /= IndentedCode -> addLeaf lineNumber (TextLine t) -- A special case: Lazy continuation of a list item looking like -- indented code e.g: -- " 1. Paragraph" -- " with two lines" (IndentedCode : _, TextLine t) | numUnmatched > 0 , _ :> L _ TextLine{} <- viewr cs , ListItem{} <- ct -> addLeaf lineNumber $ TextLine $ T.strip t -- A special case: Lazy continuaation of a quote looking like -- indented code e.g: -- "> foo" -- " - bar" (IndentedCode : _, TextLine t) | numUnmatched > 0 , _ :> L _ TextLine{} <- viewr cs , BlockQuote{} <- ct -> addLeaf lineNumber $ TextLine $ T.strip t -- if it's a setext header line and the top container has a textline -- as last child, add a setext header: ([], SetextToken lev _setextText) | numUnmatched == 0 -> case Seq.spanr isTextLine cs of (textlines, cs') -- gather all preceding textlines and put them in the header | not (Seq.null textlines) -> put $ ContainerStack (Container ct (cs' |> L lineNumber (SetextHeading lev (T.strip $ T.unlines $ fmap extractText $ toList textlines)))) rest -- Note: the following case should not occur, since -- we don't add a SetextHeading leaf unless lastLineIsText. | otherwise -> error "setext header line without preceding text lines" -- The end tag can occur on the same line as the start tag. (RawHtmlBlock condition : _, TextLine t) | Right () <- parse (blockEnd condition) t -> do closeContainer addContainer (RawHtmlBlock condition) addLeaf lineNumber (TextLine t) closeContainer -- otherwise, close all the unmatched containers, add the new -- containers, and finally add the new leaf: (ns, lf) -> do -- close unmatched containers, add new ones _ <- replicateM numUnmatched closeContainer _ <- mapM_ addContainer ns case (reverse ns, lf) of -- don't add extra blank at beginning of fenced code block (FencedCode{}:_, BlankLine{}) -> return () _ -> addLeaf lineNumber lf tabCrusher :: Parser () tabCrusher = do p <- getPosition replacing (go (column p - 1) "") where go cnt acc = do c <- peekChar case c of Just ' ' -> char ' ' *> go (cnt + 1) (acc <> " ") Just '\t' -> char '\t' *> go (cnt + 4 - (cnt `mod` 4)) (acc <> T.replicate (4 - (cnt `mod` 4)) " ") _ -> pure acc -- Try to match the scanners corresponding to any currently open containers. -- Return remaining text after matching scanners, plus the number of open -- containers whose scanners did not match. (These will be closed unless -- we have a lazy text line.) tryOpenContainers :: [Container] -> Text -> (Text, Int) tryOpenContainers cs t = case parse (scanners $ map containerContinue cs) t of Right (t', n) -> (t', n) Left e -> error $ "error parsing scanners: " ++ show e where scanners [] = (,0) <$> takeText scanners (p:ps) = (p *> scanners ps) <|> ((,length (p:ps)) <$> takeText) -- Try to match parsers for new containers. Return list of new -- container types, and the leaf to add inside the new containers. tryNewContainers :: Bool -> Bool -> Int -> Text -> ([ContainerType], Leaf) tryNewContainers afterListItem lastLineIsText offset t = case parse newContainers t of Right (cs,t') -> (cs, t') Left err -> error (show err) where newContainers = do getPosition >>= \pos -> setPosition pos{ column = offset + 1 } regContainers <- many (containerStart afterListItem lastLineIsText) optional (verbatimContainerStart lastLineIsText) >>= \case Just verbatimContainer -- FIXME: Very inefficient append -> (regContainers ++ [verbatimContainer],) <$> textLineOrBlank Nothing -> (regContainers,) <$> leaf lastLineIsText textLineOrBlank :: Parser Leaf textLineOrBlank = consolidate <$> takeText where consolidate ts | T.all (==' ') ts = BlankLine ts | otherwise = TextLine ts -- Parse a leaf node. leaf :: Bool -> Parser Leaf leaf lastLineIsText = scanNonindentSpace *> choice [ ATXHeading <$> parseAtxHeadingStart <*> parseAtxHeadingContent , guard lastLineIsText *> parseSetextToken , Rule <$ scanTBreakLine , textLineOrBlank ] -- Scanners scanReference :: Scanner scanReference = void $ lookAhead (char '[') -- Scan the beginning of a blockquote: up to three spaces -- indent (outside of this scanner), the `>` character, and an optional space. scanBlockquoteStart :: Scanner scanBlockquoteStart = scanChar '>' *> tabCrusher *> discardOpt (scanChar ' ') -- Parse the sequence of `#` characters that begins an ATX -- header, and return the number of characters. We require -- a space after the initial string of `#`s, as not all markdown -- implementations do. This is because (a) the ATX reference -- implementation requires a space, and (b) since we're allowing -- headers without preceding blank lines, requiring the space -- avoids accidentally capturing a line like `#8 toggle bolt` as -- a header. parseAtxHeadingStart :: Parser HeadingLevel parseAtxHeadingStart = do _ <- char '#' hashes <- upToCountChars 5 (== '#') -- hashes must be followed by space unless empty header: notFollowedBy (skip ((/= ' ') <&&> (/= '\t'))) pure $ case (T.length hashes + 1) of 1 -> Heading1 2 -> Heading2 3 -> Heading3 4 -> Heading4 5 -> Heading5 6 -> Heading6 _ -> error $ "IMPOSSIBLE HAPPENED: parseAtxHeading parsed more than 6 characters " parseAtxHeadingContent :: Parser Text parseAtxHeadingContent = T.strip . removeATXSuffix <$> takeText where removeATXSuffix t = case dropTrailingHashes of t' | T.null t' -> t' -- an escaped \# | T.last t' == '\\' -> t' <> T.replicate trailingHashes "#" | T.last t' /= ' ' -> t | otherwise -> t' where dropTrailingSpaces = T.dropWhileEnd (== ' ') t dropTrailingHashes = T.dropWhileEnd (== '#') dropTrailingSpaces trailingHashes = T.length dropTrailingSpaces - T.length dropTrailingHashes parseSetextToken :: Parser Leaf parseSetextToken = fmap (uncurry SetextToken) $ withConsumed $ do d <- satisfy (\c -> c == '-' || c == '=') skipWhile (== d) scanBlankline return $ if d == '=' then Heading1 else Heading2 -- Scan a horizontal rule line: "...three or more hyphens, asterisks, -- or underscores on a line by themselves. If you wish, you may use -- spaces between the hyphens or asterisks." scanTBreakLine :: Scanner scanTBreakLine = do c <- satisfy ((== '*') <||> (== '_') <||> (== '-')) replicateM_ 2 $ skipWhile ((== ' ') <||> (== '\t')) *> skip (== c) skipWhile ((== ' ') <||> (== '\t') <||> (== c)) endOfInput -- Parse an initial code fence line, returning -- the fence part and the rest (after any spaces). parseCodeFence :: Parser ContainerType parseCodeFence = do col <- column <$> getPosition cs <- takeWhile1 (=='`') <|> takeWhile1 (=='~') guard $ T.length cs >= 3 scanSpaces rawattr <- optional (takeWhile1 (\c -> c /= '`' && c /= '~')) endOfInput return FencedCode { startColumn = col , fence = cs , info = rawattr } pHtmlBlockStart :: Bool -> Parser Condition pHtmlBlockStart lastLineIsText = lookAhead $ do discardOpt scanNonindentSpace choice starters where starters = [ condition1 <$ blockStart condition1 , condition2 <$ blockStart condition2 , condition3 <$ blockStart condition3 , condition4 <$ blockStart condition4 , condition5 <$ blockStart condition5 , condition6 <$ blockStart condition6 , condition7 <$ if lastLineIsText then mzero else blockStart condition7 ] data Condition = Condition { blockStart :: Parser () , blockEnd :: Parser () } instance Show Condition where show _ = "Condition{}" instance Eq Condition where _ == _ = False lineContains :: Foldable t => t Text -> Parser () lineContains terms = do line <- T.toCaseFold <$> takeTill ((== '\r') <||> (== '\n')) guard $ any (`T.isInfixOf` line) terms condition1, condition2, condition3, condition4, condition5, condition6, condition7 :: Condition condition1 = Condition { blockStart = do _ <- choice $ map stringCaseless [" void ">" <|> lineEnding <|> endOfInput , blockEnd = lineContains ["", "", ""] } condition2 = Condition { blockStart = void ""] } condition3 = Condition { blockStart = void ""] } condition4 = Condition { blockStart = void $ " satisfy isAsciiUpper , blockEnd = void $ lineContains [">"] } condition5 = Condition { blockStart = void $ ""] } condition6 = Condition { blockStart = do void $ " "<" tag <- takeTill (isWhitespace <||> (== '\n') <||> (== '\r') <||> (== '/') <||> (== '>')) guard $ isBlockHtmlTag (T.toLower tag) void pWhitespace <|> lineEnding <|> void ">" <|> void "/>" , blockEnd = scanBlankline } condition7 = Condition { blockStart = (openTag <|> closeTag) *> (void pWhitespace <|> endOfInput) , blockEnd = scanBlankline } where tagName = do c <- satisfy (inClass "A-Za-z") cs <- takeWhile ((== '-') <||> inClass "A-Za-z0-9") guard (T.cons c cs `notElem` ["script", "style", "pre"]) attr = skipWhitespace *> attrName *> optional attrValueSpec attrName = satisfy (inClass "_:A-Za-z") *> skipWhile (inClass "A-Za-z0-9_.:-") attrValueSpec = optional skipWhitespace *> char '=' *> optional skipWhitespace *> attrValue attrValue = void unquoted <|> void singleQuoted <|> void doubleQuoted unquoted = skipWhile1 (notInClass " \"'=<>`") singleQuoted = "'" *> skipWhile (/= '\'') *> "'" doubleQuoted = "\"" *> skipWhile (/= '"') *> "\"" openTag = "<" *> tagName *> many attr *> optional skipWhitespace *> optional "/" *> ">" closeTag = " tagName *> optional skipWhitespace *> ">" -- List of block level tags for HTML 5. isBlockHtmlTag :: Text -> Bool isBlockHtmlTag name = T.toLower name `Set.member` Set.fromList [ "address", "article", "aside", "base", "basefont", "blockquote" , "body", "caption", "center", "col", "colgroup", "dd", "details" , "dialog", "dir", "div", "dl", "dt", "fieldset", "figcaption" , "figure", "footer", "form", "frame", "frameset" , "h1", "h2", "h3", "h4", "h5", "h6", "head", "header" , "hr", "html", "iframe", "legend", "li", "link", "main" , "menu", "menuitem", "meta", "nav", "noframes", "ol", "optgroup" , "option", "p", "param", "section", "source", "summary", "table" , "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track", "ul" ] -- Parse a list marker and return the list type. -- -- " 1. Content" -- ^^ ^^-- contentPadding -- \\-- markerPadding parseListMarker :: Bool -> Bool -> Parser ContainerType parseListMarker afterListItem lastLineIsText = do tabCrusher markerPadding <- if afterListItem then countSpaces else countNonindentSpace ty <- parseBullet <|> parseListNumber lastLineIsText -- padding is 1 if list marker followed by a blank line -- or indented code. otherwise it's the length of the -- whitespace between the list marker and the following text: tabCrusher contentPadding <- (1 <$ scanBlankline) <|> (1 <$ (skip (==' ') *> lookAhead scanIndentSpace)) <|> countSpaces -- text can't immediately follow the list marker: guard $ contentPadding > 0 -- an empty list item cannot interrupt a paragraph when lastLineIsText $ notFollowedBy endOfInput return ListItem { itemType = ty , padding = markerPadding + contentPadding + listMarkerWidth ty } listMarkerWidth :: ListType -> Int listMarkerWidth (Bullet _) = 1 listMarkerWidth (Ordered _ n) | n < 10 = 2 | n < 100 = 3 | n < 1000 = 4 | otherwise = 5 -- Parse a bullet and return list type. parseBullet :: Parser ListType parseBullet = do (bulletType, bulletChar) <- ((Plus,) <$> satisfy (== '+')) <|> ((Minus,) <$> satisfy (== '-')) <|> ((Asterisk,) <$> satisfy (== '*')) unless (bulletType == Plus) $ nfb (replicateM 2 (skipWhile ((== ' ') <||> (== '\t')) >> skip (== bulletChar)) >> skipWhile (\x -> x == '\t' || x == ' ' || x == bulletChar) >> endOfInput) -- hrule return $ Bullet bulletType -- Parse a list number marker and return list type. parseListNumber :: Bool -> Parser ListType parseListNumber lastLineIsText = do num :: Integer <- decimal when lastLineIsText $ guard $ num == 1 guard $ num < (10 ^ (9 :: Integer)) wrap <- choice [Period <$ scanChar '.', Paren <$ scanChar ')'] return $ Ordered wrap (fromInteger num)