{-# LANGUAGE FlexibleContexts #-}
module Docvim.Parse ( parse
, rstrip
, strip
, unit
) where
import Control.Applicative ( (*>)
, (<$)
, (<$>)
, (<*)
, (<*>)
, liftA
, liftA2
)
import Data.Char (toUpper)
import Data.List (groupBy, intercalate)
import Docvim.AST
import System.Exit (exitFailure)
import System.IO (hPutStrLn, stderr)
-- TODO: custom error messages with >
import Text.Parsec ( (<|>)
, (>)
, ParseError
, choice
, digit
, lookAhead
, many
, many1
, manyTill
, notFollowedBy
, option
, optionMaybe
, optional
, parseTest
, satisfy
, sepBy
, sepBy1
, sepEndBy
, sepEndBy1
, skipMany
, skipMany1
, try
, unexpected
)
import Text.Parsec.String (Parser, parseFromFile)
import Text.Parsec.Combinator (eof)
import Text.ParserCombinators.Parsec.Char ( alphaNum
, anyChar
, char
, noneOf
, oneOf
, string
, upper
)
-- | Given a `description` like "fu[nction]", returns a parser that matches
-- "fu", "fun", "func", "funct", "functi", "functio" and "function".
--
-- Beware, may explode at runtime if passed an invalid `description`, due to the
-- use of `init`.
--
-- Requires the FlexibleContexts extension, for reasons that I don't yet fully
-- understand.
command description = try (string prefix >> remainder rest)
> prefix ++ rest
where prefix = takeWhile (/= '[') description
rest = init (snd (splitAt (1 + length prefix) description))
remainder [r] = optional (char r)
remainder (r:rs) = optional (char r >> remainder rs)
function = FunctionDeclaration
<$> (fu *> bang <* wsc)
<*> (name <* optional wsc)
<*> arguments
<*> (attributes <* optional wsc)
<*> (skippable *> many node <* (optional ws >> endfunction))
where
fu = command "fu[nction]"
name = choice [script, normal, autoloaded] <* optional wsc
script = liftA2 (++) (try $ string "s:") (many $ oneOf identifier)
normal = liftA2 (++) (many1 upper) (many $ oneOf identifier)
autoloaded = do
a <- many1 $ oneOf identifier
b <- string "#"
c <- sepBy1 (many1 $ oneOf identifier) (string "#")
return $ a ++ b ++ intercalate "#" c
identifier = ['a'..'z'] ++ ['A'..'Z'] ++ ['0'..'9'] ++ "_"
arguments = (char '(' >> optional wsc)
*> (ArgumentList <$> argument `sepBy` (char ',' >> optional wsc))
<* (optional wsc >> char ')' >> optional wsc)
argument = Argument <$> (string "..." <|> many1 alphaNum) <* optional wsc
attributes = choice [string "abort", string "range", string "dict"] `sepEndBy` wsc
-- Disambiguate `:endf[unction]` and `:endfo[r]`
endfunction = lookAhead (string "endf" >> notFollowedBy (string "o"))
>> command "endf[unction]"
<* eos
lStatement = lookAhead (char 'l')
>> choice [ try (lookAhead (string "lw")) >> lwindow
, try (lookAhead (string "let")) >> letStatement
, lexpr
]
lwindow = LwindowStatement <$> (lw *> height <* eos)
where
lw = command "l[window]"
height = optionMaybe (wsc *> number)
number = liftA read (many1 digit)
lexpr = LexprStatement
<$> (command "lex[pr]" *> bang <* wsc)
<*> restOfLine
-- "let" is a reserved word in Haskell, so we call this "letStatement" instead.
letStatement = LetStatement
<$> (string "let" >> wsc >> lhs)
<*> (optional wsc >> char '=' >> optional wsc *> rhs <* eos)
where
-- Kludge alert! Until we get a full expression parser, we use this crude
-- thing.
lhs = many1 $ noneOf "\"\n="
rhs = many1 $ noneOf "\n"
unlet = UnletStatement
<$> (unl *> bang <* wsc)
<*> word
<* eos
where
unl = command "unl[et]"
quote = string "\"" > "quote"
commentStart = quote <* (notFollowedBy quote >> optional ws)
docBlockStart = (string "\"\"" <* optional ws) > "\"\""
separator = Separator <$ (try (string "---") >> optional ws) > "wat"
fenced = fence >> newline >> Fenced <$> body
where
fence = try $ string "```" >> optional ws
body = do
lines <- manyTill line (try $ (commentStart <|> docBlockStart) >> optional ws >> fence)
let indent = foldr countLeadingSpaces infinity lines
return $ map (trimLeadingSpace indent) lines
where
-- Find minimum count of leading spaces.
countLeadingSpaces line = min (length (takeWhile (' ' ==) line))
trimLeadingSpace count = if count > 0
then drop count
else id
infinity = maxBound :: Int
line = (commentStart' <|> docBlockStart') >> restOfLine <* newline
commentStart' = quote <* notFollowedBy quote
docBlockStart' = string "\"\"" > "\"\""
blockquote = lookAhead (char '>')
>> Blockquote
<$> paragraph `sepBy1` blankLine
where
paragraph = Paragraph <$> body
body = do
first <- firstLine
rest <- many otherLine
-- Make every line end with whitespace.
let nodes = concatMap appendWhitespace (first:rest)
-- Collapse consecutive whitespace.
let compressed = compress nodes
-- Trim final whitespace.
return ( if last compressed == Whitespace
then init compressed
else compressed )
firstLine = char '>'
>> optional ws
>> many1 (choice [phrasing, whitespace])
otherLine = try $ newline
>> (commentStart <|> docBlockStart)
>> firstLine
blankLine = try $ newline
>> (commentStart <|> docBlockStart)
>> many1 (try $ char '>'
>> optional ws
>> newline
>> (commentStart <|> docBlockStart))
list = lookAhead (char '-' >> notFollowedBy (char '-'))
>> List
<$> listItem `sepBy1` separator
where
-- Yes, this is a bit hideous.
separator = try $ newline
>> (commentStart <|> docBlockStart)
>> optional ws
>> lookAhead (char '-')
listItem = lookAhead (char '-' >> notFollowedBy (char '-'))
>> ListItem
<$> body
where
body = do
first <- char '-' >> optional ws >> many1 (choice [phrasing, whitespace])
rest <- many otherLine
-- Make every line end with whitespace.
let nodes = concatMap appendWhitespace (first:rest)
-- Collapse consecutive whitespace.
let compressed = compress nodes
-- Trim final whitespace.
return ( if last compressed == Whitespace
then init compressed
else compressed )
otherLine = try $ newline
>> (commentStart <|> docBlockStart)
-- TODO ^ DRY this up?
>> optional ws
>> lookAhead (noneOf "-")
>> many1 (choice [phrasing, whitespace])
-- | Newline (and slurps up following horizontal whitespace as well).
newline = (char '\n' >> optional ws) <|> eof
newlines = many1 (char '\n' >> optional ws)
<|> (eof >> return [()])
-- | Whitespace (specifically, horizontal whitespace: spaces and tabs).
ws = many1 (oneOf " \t")
-- | Continuation-aware whitespace (\).
wsc = many1 $ choice [whitespace, continuation]
where
whitespace = oneOf " \t"
continuation = try $ char '\n' >> ws >> char '\\'
-- TODO: string literals; some nasty lookahead might be required
comment = try
$ quote
>> notFollowedBy quote
>> restOfLine
>> skipMany (char '\n' >> optional ws)
-- | Optional bang suffix for VimL commands.
bang = option False (True <$ char '!')
-- | End-of-statement.
-- TODO: see `:h :bar` for a list of commands which see | as an arg instead of a
-- command separator.
eos = optional ws >> choice [bar, ws', skipMany1 comment]
where
bar = char '|' >> optional wsc
ws' = newlines >> notFollowedBy wsc
node :: Parser Node
node = choice [ docBlock
, vimL
]
<* optional skippable
docBlock = lookAhead docBlockStart
>> (DocBlock <$> many1 blockElement)
<* trailingBlankCommentLines
where
blockElement = try $ start
>> skipMany emptyLines
*> choice [ annotation
, try subheading -- must come before heading
, heading
, linkTargets
, separator
, list
, blockquote
, fenced
, paragraph -- must come last
]
<* next
start = try docBlockStart <|> commentStart
emptyLines = try $ newline >> start
next = optional ws >> newline
trailingBlankCommentLines = skipMany $ start >> newline
paragraph = Paragraph <$> body
where
body = do
first <- firstLine
rest <- many otherLine
-- Make every line end with whitespace
let nodes = concatMap appendWhitespace (first:rest)
-- Collapse consecutive whitespace
let compressed = compress nodes
-- Trim final whitespace
return ( if last compressed == Whitespace
then init compressed
else compressed )
firstLine = many1 $ choice [phrasing, whitespace]
otherLine = try $ newline
>> (commentStart <|> docBlockStart)
>> optional ws
>> notFollowedBy special
>> firstLine
-- | Used in lookahead rules to make sure that we don't greedily consume special
-- tokens as if they were just phrasing content.
special :: Parser String
special = choice [ string "-" <* notFollowedBy (char '-')
, string ">"
, string "---"
, string "-" <* string "--"
, string "```"
, string "`" <* string "``"
, string "@"
, string "#"
]
phrasing = choice [ br
, link
, code
, plaintext
]
-- | Appends a Whitespace token to a list of nodes.
appendWhitespace :: [Node] -> [Node]
appendWhitespace xs = xs ++ [Whitespace]
-- | Compress whitespace.
-- Consecutive Whitespace tokens are replaced with a single token.
-- If a run of whitespace includes a BreakTag, the run is replaced with the
-- BreakTag.
compress :: [Node] -> [Node]
compress = map prioritizeBreakTag . group
where
group = groupBy fn
fn BreakTag Whitespace = True
fn Whitespace BreakTag = True
fn Whitespace Whitespace = True
fn _ _ = False
prioritizeBreakTag xs = if hasBreakTag xs
then BreakTag
else head xs
hasBreakTag = elem BreakTag
-- similar to "word"... might end up replacing "word" later on...
-- something more sophisticated here with satisfy?
plaintext = Plaintext <$> wordChars
where
wordChars = many1 $ choice [ try $ char '<' <* notFollowedBy (string' "br")
, noneOf " \n\t<|`"
]
-- | Case-insensitive char match.
--
-- Based on `caseChar` function in:
-- https://hackage.haskell.org/package/hsemail-1.3/docs/Text-ParserCombinators-Parsec-Rfc2234.html
char' c = satisfy $ \x -> toUpper x == toUpper c
-- | Case-insensitive string match.
--
-- Based on `caseString` function in:
-- https://hackage.haskell.org/package/hsemail-1.3/docs/Text-ParserCombinators-Parsec-Rfc2234.html
string' s = mapM_ char' s >> pure s > s
-- | Tokenized whitespace.
--
-- Most whitespace is insignificant and gets omitted from the AST, but
-- whitespace inside "phrasing content" is significant so is preserved (in
-- normalized form) in the AST.
whitespace = Whitespace <$ ws
br = BreakTag <$ (try htmlTag <|> try xhtmlTag) > "
"
where
htmlTag = string' "
"
xhtmlTag = string' "
> optional ws >> string "/>"
link = Link <$> (bar *> linkText <* bar)
where
bar = char '|'
linkText = many1 $ noneOf " \t\n|"
code = Code <$> (backtick *> codeText <* backtick)
where
backtick = char '`'
codeText = many $ noneOf "\n`"
-- TODO: record this in symbol table similar to
-- https://github.com/wincent/docvim/blob/js/src/SymbolVisitor.js
-- (probably want to make this a post-processing step?)
linkTargets = LinkTargets <$> many1 (star *> target <* (star >> optional ws))
where
star = char '*'
target = many1 $ noneOf " \t\n*"
vimL = choice [ block
, statement
]
block = choice [ function ]
statement = choice [ lStatement
, unlet
, genericStatement
]
-- | Generic VimL node parser to represent stuff that we haven't built out full parsing
-- for yet.
genericStatement = do
-- Make sure we never recognize `endfunction` as a generic statement. This is
-- necessary because we call `node` recursively inside `function` while
-- parsing the function body. We must stop `node` from consuming
-- `endfunction`, otherwise the `function` parse will fail to find it.
notFollowedBy endfunction
atoms <- sepEndBy1 word (optional wsc)
eos
return $ GenericStatement $ unwords atoms
-- | Remainder of the line up to but not including a newline.
-- Does not include any trailing whitespace.
restOfLine :: Parser String
restOfLine = do
rest <- many (noneOf "\n")
return $ rstrip rest
-- | Strip trailing and leading whitespace.
--
-- Not efficient, but chosen for readablility.
--
-- TODO: switch to Data.Text (http://stackoverflow.com/a/6270382/2103996) for
-- efficiency.
strip = lstrip . rstrip
-- | Strip leading (left) whitespace.
lstrip = dropWhile (`elem` " \n\t")
-- | Strip trailing (right) whitespace.
rstrip = reverse . lstrip . reverse
heading :: Parser Node
heading = char '#'
>> notFollowedBy (char '#')
>> optional ws
>> HeadingAnnotation <$> restOfLine
subheading :: Parser Node
subheading = string "##"
>> optional ws
>> SubheadingAnnotation <$> restOfLine
-- | Match a "word" of non-whitespace characters.
word = many1 (noneOf " \n\t")
-- TODO: only allow these after "" and " at start of line
annotation :: Parser Node
annotation = char '@' *> annotationName
where
annotationName =
choice [ try $ string "commands" >> pure CommandsAnnotation -- must come before function
, command
, string "dedent" >> pure DedentAnnotation
, try $ string "footer" >> pure FooterAnnotation -- must come before function
, try $ string "functions" >> pure FunctionsAnnotation -- must come before function
, function
, string "indent" >> pure IndentAnnotation
, try $ string "mappings" >> pure MappingsAnnotation -- must come before mapping
, mapping
, try $ string "options" >> pure OptionsAnnotation -- must come before option
, option
, plugin
]
command = string "command" >> ws >> CommandAnnotation <$> commandName <*> commandParameters
commandName = char ':' *> many1 alphaNum <* optional ws
commandParameters = optionMaybe $ many1 (noneOf "\n")
function = string "function" >> ws >> FunctionAnnotation <$> word <* optional ws
mapping = string "mapping" >> ws >> MappingAnnotation <$> mappingName
mappingName = word <* optional ws
option = string "option" >> ws >> OptionAnnotation <$> optionName <*> optionType <*> optionDefault
optionName = many1 (alphaNum <|> char ':') <* ws > "option name"
optionType = many1 alphaNum <* optional ws > "option type"
optionDefault = optionMaybe word > "option default value"
plugin = string "plugin" >> ws >> PluginAnnotation <$> pluginName <*> plugInDescription
pluginName = many1 alphaNum <* ws
plugInDescription = restOfLine
-- | Parses a translation unit (file contents) into an AST.
unit :: Parser Node
unit = Unit
<$> (skippable >> many node)
<* eof
skippable = many $ choice [ comment
, skipMany1 ws
, skipMany1 (char '\n')
]
parse :: String -> IO Node
parse fileName = parseFromFile unit fileName >>= either report return
where
report err = do
hPutStrLn stderr $ "Error: " ++ show err
exitFailure