module Text.Pandoc.Parsing ( (>>~),
anyLine,
many1Till,
notFollowedBy',
oneOfStrings,
spaceChar,
nonspaceChar,
skipSpaces,
blankline,
blanklines,
enclosed,
stringAnyCase,
parseFromString,
lineClump,
charsInBalanced,
romanNumeral,
emailAddress,
uri,
withHorizDisplacement,
withRaw,
nullBlock,
failIfStrict,
failUnlessLHS,
escaped,
characterReference,
anyOrderedListMarker,
orderedListMarker,
charRef,
tableWith,
gridTableWith,
readWith,
testStringWith,
ParserState (..),
defaultParserState,
HeaderType (..),
ParserContext (..),
QuoteContext (..),
NoteTable,
KeyTable,
Key,
toKey,
fromKey,
lookupKeySrc,
smartPunctuation,
macro,
applyMacros' )
where
import Text.Pandoc.Definition
import Text.Pandoc.Generic
import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
import Text.ParserCombinators.Parsec
import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation )
import Data.List ( intercalate, transpose )
import Network.URI ( parseURI, URI (..), isAllowedInURI )
import Control.Monad ( join, liftM, guard )
import Text.Pandoc.Shared
import qualified Data.Map as M
import Text.TeXMath.Macros (applyMacros, Macro, parseMacroDefinitions)
import Text.HTML.TagSoup.Entity ( lookupEntity )
(>>~) :: (Monad m) => m a -> m b -> m a
a >>~ b = a >>= \x -> b >> return x
anyLine :: GenParser Char st [Char]
anyLine = manyTill anyChar newline
many1Till :: GenParser tok st a
-> GenParser tok st end
-> GenParser tok st [a]
many1Till p end = do
first <- p
rest <- manyTill p end
return (first:rest)
notFollowedBy' :: Show b => GenParser a st b -> GenParser a st ()
notFollowedBy' p = try $ join $ do a <- try p
return (unexpected (show a))
<|>
return (return ())
oneOfStrings :: [String] -> GenParser Char st String
oneOfStrings listOfStrings = choice $ map (try . string) listOfStrings
spaceChar :: CharParser st Char
spaceChar = satisfy $ \c -> c == ' ' || c == '\t'
nonspaceChar :: CharParser st Char
nonspaceChar = satisfy $ \x -> x /= '\t' && x /= '\n' && x /= ' ' && x /= '\r'
skipSpaces :: GenParser Char st ()
skipSpaces = skipMany spaceChar
blankline :: GenParser Char st Char
blankline = try $ skipSpaces >> newline
blanklines :: GenParser Char st [Char]
blanklines = many1 blankline
enclosed :: GenParser Char st t
-> GenParser Char st end
-> GenParser Char st a
-> GenParser Char st [a]
enclosed start end parser = try $
start >> notFollowedBy space >> many1Till parser end
stringAnyCase :: [Char] -> CharParser st String
stringAnyCase [] = string ""
stringAnyCase (x:xs) = do
firstChar <- char (toUpper x) <|> char (toLower x)
rest <- stringAnyCase xs
return (firstChar:rest)
parseFromString :: GenParser tok st a -> [tok] -> GenParser tok st a
parseFromString parser str = do
oldPos <- getPosition
oldInput <- getInput
setInput str
result <- parser
setInput oldInput
setPosition oldPos
return result
lineClump :: GenParser Char st String
lineClump = blanklines
<|> (many1 (notFollowedBy blankline >> anyLine) >>= return . unlines)
charsInBalanced :: Char -> Char -> GenParser Char st Char
-> GenParser Char st String
charsInBalanced open close parser = try $ do
char open
let isDelim c = c == open || c == close
raw <- many $ many1 (notFollowedBy (satisfy isDelim) >> parser)
<|> (do res <- charsInBalanced open close parser
return $ [open] ++ res ++ [close])
char close
return $ concat raw
lowercaseRomanDigits :: [Char]
lowercaseRomanDigits = ['i','v','x','l','c','d','m']
uppercaseRomanDigits :: [Char]
uppercaseRomanDigits = map toUpper lowercaseRomanDigits
romanNumeral :: Bool
-> GenParser Char st Int
romanNumeral upperCase = do
let romanDigits = if upperCase
then uppercaseRomanDigits
else lowercaseRomanDigits
lookAhead $ oneOf romanDigits
let [one, five, ten, fifty, hundred, fivehundred, thousand] =
map char romanDigits
thousands <- many thousand >>= (return . (1000 *) . length)
ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900
fivehundreds <- many fivehundred >>= (return . (500 *) . length)
fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400
hundreds <- many hundred >>= (return . (100 *) . length)
nineties <- option 0 $ try $ ten >> hundred >> return 90
fifties <- many fifty >>= (return . (50 *) . length)
forties <- option 0 $ try $ ten >> fifty >> return 40
tens <- many ten >>= (return . (10 *) . length)
nines <- option 0 $ try $ one >> ten >> return 9
fives <- many five >>= (return . (5 *) . length)
fours <- option 0 $ try $ one >> five >> return 4
ones <- many one >>= (return . length)
let total = thousands + ninehundreds + fivehundreds + fourhundreds +
hundreds + nineties + fifties + forties + tens + nines +
fives + fours + ones
if total == 0
then fail "not a roman numeral"
else return total
emailChar :: GenParser Char st Char
emailChar = alphaNum <|>
satisfy (\c -> c == '-' || c == '+' || c == '_' || c == '.')
domainChar :: GenParser Char st Char
domainChar = alphaNum <|> char '-'
domain :: GenParser Char st [Char]
domain = do
first <- many1 domainChar
dom <- many1 $ try (char '.' >> many1 domainChar )
return $ intercalate "." (first:dom)
emailAddress :: GenParser Char st (String, String)
emailAddress = try $ do
firstLetter <- alphaNum
restAddr <- many emailChar
let addr = firstLetter:restAddr
char '@'
dom <- domain
let full = addr ++ '@':dom
return (full, escapeURI $ "mailto:" ++ full)
uri :: GenParser Char st (String, String)
uri = try $ do
let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
"news:", "telnet:" ]
lookAhead $ oneOfStrings protocols
let innerPunct = try $ satisfy isPunctuation >>~
notFollowedBy (newline <|> spaceChar)
let uriChar = innerPunct <|>
satisfy (\c -> not (isPunctuation c) &&
(not (isAscii c) || isAllowedInURI c))
let inParens = try $ do char '('
res <- many uriChar
char ')'
return $ '(' : res ++ ")"
str <- liftM concat $ many1 $ inParens <|> count 1 (innerPunct <|> uriChar)
case parseURI (escapeURI str) of
Just uri' -> if uriScheme uri' `elem` protocols
then return (str, show uri')
else fail "not a URI"
Nothing -> fail "not a URI"
withHorizDisplacement :: GenParser Char st a
-> GenParser Char st (a, Int)
withHorizDisplacement parser = do
pos1 <- getPosition
result <- parser
pos2 <- getPosition
return (result, sourceColumn pos2 sourceColumn pos1)
withRaw :: GenParser Char st a -> GenParser Char st (a, [Char])
withRaw parser = do
pos1 <- getPosition
inp <- getInput
result <- parser
pos2 <- getPosition
let (l1,c1) = (sourceLine pos1, sourceColumn pos1)
let (l2,c2) = (sourceLine pos2, sourceColumn pos2)
let inplines = take ((l2 l1) + 1) $ lines inp
let raw = case inplines of
[] -> error "raw: inplines is null"
[l] -> take (c2 c1) l
ls -> unlines (init ls) ++ take (c2 1) (last ls)
return (result, raw)
nullBlock :: GenParser Char st Block
nullBlock = anyChar >> return Null
failIfStrict :: GenParser a ParserState ()
failIfStrict = do
state <- getState
if stateStrict state then fail "strict mode" else return ()
failUnlessLHS :: GenParser tok ParserState ()
failUnlessLHS = getState >>= guard . stateLiterateHaskell
escaped :: GenParser Char st Char
-> GenParser Char st Char
escaped parser = try $ char '\\' >> parser
characterReference :: GenParser Char st Char
characterReference = try $ do
char '&'
ent <- many1Till nonspaceChar (char ';')
case lookupEntity ent of
Just c -> return c
Nothing -> fail "entity not found"
upperRoman :: GenParser Char st (ListNumberStyle, Int)
upperRoman = do
num <- romanNumeral True
return (UpperRoman, num)
lowerRoman :: GenParser Char st (ListNumberStyle, Int)
lowerRoman = do
num <- romanNumeral False
return (LowerRoman, num)
decimal :: GenParser Char st (ListNumberStyle, Int)
decimal = do
num <- many1 digit
return (Decimal, read num)
exampleNum :: GenParser Char ParserState (ListNumberStyle, Int)
exampleNum = do
char '@'
lab <- many (alphaNum <|> satisfy (\c -> c == '_' || c == '-'))
st <- getState
let num = stateNextExample st
let newlabels = if null lab
then stateExamples st
else M.insert lab num $ stateExamples st
updateState $ \s -> s{ stateNextExample = num + 1
, stateExamples = newlabels }
return (Example, num)
defaultNum :: GenParser Char st (ListNumberStyle, Int)
defaultNum = do
char '#'
return (DefaultStyle, 1)
lowerAlpha :: GenParser Char st (ListNumberStyle, Int)
lowerAlpha = do
ch <- oneOf ['a'..'z']
return (LowerAlpha, ord ch ord 'a' + 1)
upperAlpha :: GenParser Char st (ListNumberStyle, Int)
upperAlpha = do
ch <- oneOf ['A'..'Z']
return (UpperAlpha, ord ch ord 'A' + 1)
romanOne :: GenParser Char st (ListNumberStyle, Int)
romanOne = (char 'i' >> return (LowerRoman, 1)) <|>
(char 'I' >> return (UpperRoman, 1))
anyOrderedListMarker :: GenParser Char ParserState ListAttributes
anyOrderedListMarker = choice $
[delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens],
numParser <- [decimal, exampleNum, defaultNum, romanOne,
lowerAlpha, lowerRoman, upperAlpha, upperRoman]]
inPeriod :: GenParser Char st (ListNumberStyle, Int)
-> GenParser Char st ListAttributes
inPeriod num = try $ do
(style, start) <- num
char '.'
let delim = if style == DefaultStyle
then DefaultDelim
else Period
return (start, style, delim)
inOneParen :: GenParser Char st (ListNumberStyle, Int)
-> GenParser Char st ListAttributes
inOneParen num = try $ do
(style, start) <- num
char ')'
return (start, style, OneParen)
inTwoParens :: GenParser Char st (ListNumberStyle, Int)
-> GenParser Char st ListAttributes
inTwoParens num = try $ do
char '('
(style, start) <- num
char ')'
return (start, style, TwoParens)
orderedListMarker :: ListNumberStyle
-> ListNumberDelim
-> GenParser Char ParserState Int
orderedListMarker style delim = do
let num = defaultNum <|>
case style of
DefaultStyle -> decimal
Example -> exampleNum
Decimal -> decimal
UpperRoman -> upperRoman
LowerRoman -> lowerRoman
UpperAlpha -> upperAlpha
LowerAlpha -> lowerAlpha
let context = case delim of
DefaultDelim -> inPeriod
Period -> inPeriod
OneParen -> inOneParen
TwoParens -> inTwoParens
(start, _, _) <- context num
return start
charRef :: GenParser Char st Inline
charRef = do
c <- characterReference
return $ Str [c]
tableWith :: GenParser Char ParserState ([[Block]], [Alignment], [Int])
-> ([Int] -> GenParser Char ParserState [[Block]])
-> GenParser Char ParserState sep
-> GenParser Char ParserState end
-> GenParser Char ParserState [Inline]
-> GenParser Char ParserState Block
tableWith headerParser rowParser lineParser footerParser captionParser = try $ do
caption' <- option [] captionParser
(heads, aligns, indices) <- headerParser
lines' <- rowParser indices `sepEndBy` lineParser
footerParser
caption <- if null caption'
then option [] captionParser
else return caption'
state <- getState
let numColumns = stateColumns state
let widths = widthsFromIndices numColumns indices
return $ Table caption aligns widths heads lines'
widthsFromIndices :: Int
-> [Int]
-> [Double]
widthsFromIndices _ [] = []
widthsFromIndices numColumns' indices =
let numColumns = max numColumns' (if null indices then 0 else last indices)
lengths' = zipWith () indices (0:indices)
lengths = reverse $
case reverse lengths' of
[] -> []
[x] -> [x]
(x:y:zs) -> if x < y && y x <= 2
then y:y:zs
else x:y:zs
totLength = sum lengths
quotient = if totLength > numColumns
then fromIntegral totLength
else fromIntegral numColumns
fracs = map (\l -> (fromIntegral l) / quotient) lengths in
tail fracs
gridTableWith :: GenParser Char ParserState Block
-> GenParser Char ParserState [Inline]
-> Bool
-> GenParser Char ParserState Block
gridTableWith block tableCaption headless =
tableWith (gridTableHeader headless block) (gridTableRow block) (gridTableSep '-') gridTableFooter tableCaption
gridTableSplitLine :: [Int] -> String -> [String]
gridTableSplitLine indices line = map removeFinalBar $ tail $
splitStringByIndices (init indices) $ removeTrailingSpace line
gridPart :: Char -> GenParser Char st (Int, Int)
gridPart ch = do
dashes <- many1 (char ch)
char '+'
return (length dashes, length dashes + 1)
gridDashedLines :: Char -> GenParser Char st [(Int,Int)]
gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline
removeFinalBar :: String -> String
removeFinalBar =
reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse
gridTableSep :: Char -> GenParser Char ParserState Char
gridTableSep ch = try $ gridDashedLines ch >> return '\n'
gridTableHeader :: Bool
-> GenParser Char ParserState Block
-> GenParser Char ParserState ([[Block]], [Alignment], [Int])
gridTableHeader headless block = try $ do
optional blanklines
dashes <- gridDashedLines '-'
rawContent <- if headless
then return $ repeat ""
else many1
(notFollowedBy (gridTableSep '=') >> char '|' >>
many1Till anyChar newline)
if headless
then return ()
else gridTableSep '=' >> return ()
let lines' = map snd dashes
let indices = scanl (+) 0 lines'
let aligns = replicate (length lines') AlignDefault
let rawHeads = if headless
then replicate (length dashes) ""
else map (intercalate " ") $ transpose
$ map (gridTableSplitLine indices) rawContent
heads <- mapM (parseFromString $ many block) $
map removeLeadingTrailingSpace rawHeads
return (heads, aligns, indices)
gridTableRawLine :: [Int] -> GenParser Char ParserState [String]
gridTableRawLine indices = do
char '|'
line <- many1Till anyChar newline
return (gridTableSplitLine indices line)
gridTableRow :: GenParser Char ParserState Block
-> [Int]
-> GenParser Char ParserState [[Block]]
gridTableRow block indices = do
colLines <- many1 (gridTableRawLine indices)
let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $
transpose colLines
mapM (liftM compactifyCell . parseFromString (many block)) cols
removeOneLeadingSpace :: [String] -> [String]
removeOneLeadingSpace xs =
if all startsWithSpace xs
then map (drop 1) xs
else xs
where startsWithSpace "" = True
startsWithSpace (y:_) = y == ' '
compactifyCell :: [Block] -> [Block]
compactifyCell bs = head $ compactify [bs]
gridTableFooter :: GenParser Char ParserState [Char]
gridTableFooter = blanklines
readWith :: GenParser t ParserState a
-> ParserState
-> [t]
-> a
readWith parser state input =
case runParser parser state "source" input of
Left err' -> error $ "\nError:\n" ++ show err'
Right result -> result
testStringWith :: (Show a) => GenParser Char ParserState a
-> String
-> IO ()
testStringWith parser str = UTF8.putStrLn $ show $
readWith parser defaultParserState str
data ParserState = ParserState
{ stateParseRaw :: Bool,
stateParserContext :: ParserContext,
stateQuoteContext :: QuoteContext,
stateMaxNestingLevel :: Int,
stateLastStrPos :: Maybe SourcePos,
stateKeys :: KeyTable,
stateCitations :: [String],
stateNotes :: NoteTable,
stateTabStop :: Int,
stateStandalone :: Bool,
stateTitle :: [Inline],
stateAuthors :: [[Inline]],
stateDate :: [Inline],
stateStrict :: Bool,
stateSmart :: Bool,
stateOldDashes :: Bool,
stateLiterateHaskell :: Bool,
stateColumns :: Int,
stateHeaderTable :: [HeaderType],
stateIndentedCodeClasses :: [String],
stateNextExample :: Int,
stateExamples :: M.Map String Int,
stateHasChapters :: Bool,
stateApplyMacros :: Bool,
stateMacros :: [Macro]
}
deriving Show
defaultParserState :: ParserState
defaultParserState =
ParserState { stateParseRaw = False,
stateParserContext = NullState,
stateQuoteContext = NoQuote,
stateMaxNestingLevel = 6,
stateLastStrPos = Nothing,
stateKeys = M.empty,
stateCitations = [],
stateNotes = [],
stateTabStop = 4,
stateStandalone = False,
stateTitle = [],
stateAuthors = [],
stateDate = [],
stateStrict = False,
stateSmart = False,
stateOldDashes = False,
stateLiterateHaskell = False,
stateColumns = 80,
stateHeaderTable = [],
stateIndentedCodeClasses = [],
stateNextExample = 1,
stateExamples = M.empty,
stateHasChapters = False,
stateApplyMacros = True,
stateMacros = []}
data HeaderType
= SingleHeader Char
| DoubleHeader Char
deriving (Eq, Show)
data ParserContext
= ListItemState
| NullState
deriving (Eq, Show)
data QuoteContext
= InSingleQuote
| InDoubleQuote
| NoQuote
deriving (Eq, Show)
type NoteTable = [(String, String)]
newtype Key = Key [Inline] deriving (Show, Read, Eq, Ord)
toKey :: [Inline] -> Key
toKey = Key . bottomUp lowercase
where lowercase :: Inline -> Inline
lowercase (Str xs) = Str (map toLower xs)
lowercase (Math t xs) = Math t (map toLower xs)
lowercase (Code attr xs) = Code attr (map toLower xs)
lowercase (RawInline f xs) = RawInline f (map toLower xs)
lowercase LineBreak = Space
lowercase x = x
fromKey :: Key -> [Inline]
fromKey (Key xs) = xs
type KeyTable = M.Map Key Target
lookupKeySrc :: KeyTable
-> Key
-> Maybe Target
lookupKeySrc table key = case M.lookup key table of
Nothing -> Nothing
Just src -> Just src
failUnlessSmart :: GenParser tok ParserState ()
failUnlessSmart = getState >>= guard . stateSmart
smartPunctuation :: GenParser Char ParserState Inline
-> GenParser Char ParserState Inline
smartPunctuation inlineParser = do
failUnlessSmart
choice [ quoted inlineParser, apostrophe, dash, ellipses ]
apostrophe :: GenParser Char ParserState Inline
apostrophe = (char '\'' <|> char '\8217') >> return (Str "\x2019")
quoted :: GenParser Char ParserState Inline
-> GenParser Char ParserState Inline
quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser
withQuoteContext :: QuoteContext
-> (GenParser Char ParserState Inline)
-> GenParser Char ParserState Inline
withQuoteContext context parser = do
oldState <- getState
let oldQuoteContext = stateQuoteContext oldState
setState oldState { stateQuoteContext = context }
result <- parser
newState <- getState
setState newState { stateQuoteContext = oldQuoteContext }
return result
singleQuoted :: GenParser Char ParserState Inline
-> GenParser Char ParserState Inline
singleQuoted inlineParser = try $ do
singleQuoteStart
withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>=
return . Quoted SingleQuote . normalizeSpaces
doubleQuoted :: GenParser Char ParserState Inline
-> GenParser Char ParserState Inline
doubleQuoted inlineParser = try $ do
doubleQuoteStart
withQuoteContext InDoubleQuote $ do
contents <- manyTill inlineParser doubleQuoteEnd
return . Quoted DoubleQuote . normalizeSpaces $ contents
failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState ()
failIfInQuoteContext context = do
st <- getState
if stateQuoteContext st == context
then fail "already inside quotes"
else return ()
charOrRef :: [Char] -> GenParser Char st Char
charOrRef cs =
oneOf cs <|> try (do c <- characterReference
guard (c `elem` cs)
return c)
singleQuoteStart :: GenParser Char ParserState ()
singleQuoteStart = do
failIfInQuoteContext InSingleQuote
pos <- getPosition
st <- getState
guard $ stateLastStrPos st /= Just pos
try $ do charOrRef "'\8216\145"
notFollowedBy (oneOf ")!],;:-? \t\n")
notFollowedBy (char '.') <|> lookAhead (string "..." >> return ())
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
satisfy (not . isAlphaNum)))
return ()
singleQuoteEnd :: GenParser Char st ()
singleQuoteEnd = try $ do
charOrRef "'\8217\146"
notFollowedBy alphaNum
doubleQuoteStart :: GenParser Char ParserState ()
doubleQuoteStart = do
failIfInQuoteContext InDoubleQuote
try $ do charOrRef "\"\8220\147"
notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n'))
doubleQuoteEnd :: GenParser Char st ()
doubleQuoteEnd = do
charOrRef "\"\8221\148"
return ()
ellipses :: GenParser Char st Inline
ellipses = do
try (charOrRef "\8230\133") <|> try (string "..." >> return '…')
return (Str "\8230")
dash :: GenParser Char ParserState Inline
dash = do
oldDashes <- stateOldDashes `fmap` getState
if oldDashes
then emDashOld <|> enDashOld
else Str `fmap` (hyphenDash <|> emDash <|> enDash)
hyphenDash :: GenParser Char st String
hyphenDash = do
try $ string "--"
option "\8211" (char '-' >> return "\8212")
emDash :: GenParser Char st String
emDash = do
try (charOrRef "\8212\151")
return "\8212"
enDash :: GenParser Char st String
enDash = do
try (charOrRef "\8212\151")
return "\8211"
enDashOld :: GenParser Char st Inline
enDashOld = do
try (charOrRef "\8211\150") <|>
try (char '-' >> lookAhead (satisfy isDigit) >> return '–')
return (Str "\8211")
emDashOld :: GenParser Char st Inline
emDashOld = do
try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-')
return (Str "\8212")
macro :: GenParser Char ParserState Block
macro = do
getState >>= guard . stateApplyMacros
inp <- getInput
case parseMacroDefinitions inp of
([], _) -> pzero
(ms, rest) -> do count (length inp length rest) anyChar
updateState $ \st ->
st { stateMacros = ms ++ stateMacros st }
return Null
applyMacros' :: String -> GenParser Char ParserState String
applyMacros' target = do
apply <- liftM stateApplyMacros getState
if apply
then do macros <- liftM stateMacros getState
return $ applyMacros macros target
else return target