-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Parsec.Token
-- Copyright   :  (c) Daan Leijen 1999-2001, (c) Paolo Martini 2007
-- License     :  BSD-style (see the LICENSE file)
--
-- Maintainer  :  derek.a.elkins@gmail.com
-- Stability   :  provisional
-- Portability :  non-portable (uses local universal quantification: PolymorphicComponents)
--
-- A helper module to parse lexical elements (tokens). See 'makeTokenParser'
-- for a description of how to use it.
--
-----------------------------------------------------------------------------

{-# LANGUAGE PolymorphicComponents #-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}

module Text.Parsec.Token
    ( LanguageDef
    , GenLanguageDef (..)
    , TokenParser
    , GenTokenParser (..)
    , makeTokenParser
    ) where

import Data.Char ( isAlpha, toLower, toUpper, isSpace, digitToInt )
import Data.List ( nub, sort )
import Control.Monad.Identity
import Text.Parsec.Prim
import Text.Parsec.Char
import Text.Parsec.Combinator

-----------------------------------------------------------
-- Language Definition
-----------------------------------------------------------

type LanguageDef st = GenLanguageDef String st Identity

-- | The @GenLanguageDef@ type is a record that contains all parameterizable
-- features of the 'Text.Parsec.Token' module. The module 'Text.Parsec.Language'
-- contains some default definitions.

data GenLanguageDef s u m = LanguageDef {

    -- | Describes the start of a block comment. Use the empty string if the
    -- language doesn't support block comments. For example \"\/*\".

    commentStart    :: String,

    -- | Describes the end of a block comment. Use the empty string if the
    -- language doesn't support block comments. For example \"*\/\".

    commentEnd      :: String,

    -- | Describes the start of a line comment. Use the empty string if the
    -- language doesn't support line comments. For example \"\/\/\".

    commentLine     :: String,

    -- | Set to 'True' if the language supports nested block comments.

    nestedComments  :: Bool,

    -- | This parser should accept any start characters of identifiers. For
    -- example @letter \<|> char \'_\'@.

    identStart      :: ParsecT s u m Char,

    -- | This parser should accept any legal tail characters of identifiers.
    -- For example @alphaNum \<|> char \'_\'@.

    identLetter     :: ParsecT s u m Char,

    -- | This parser should accept any start characters of operators. For
    -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@

    opStart         :: ParsecT s u m Char,

    -- | This parser should accept any legal tail characters of operators.
    -- Note that this parser should even be defined if the language doesn't
    -- support user-defined operators, since otherwise the 'reservedOp'
    -- parser won't work correctly.

    opLetter        :: ParsecT s u m Char,

    -- | The list of reserved identifiers.

    reservedNames   :: [String],

    -- | The list of reserved operators.

    reservedOpNames :: [String],

    -- | Set to 'True' if the language is case sensitive.

    caseSensitive   :: Bool

    }

-----------------------------------------------------------
-- A first class module: TokenParser
-----------------------------------------------------------

type TokenParser st = GenTokenParser String st Identity

-- | The type of the record that holds lexical parsers that work on
-- @s@ streams with state @u@ over a monad @m@.

data GenTokenParser s u m = TokenParser {

        -- | This lexeme parser parses a legal identifier. Returns the identifier
        -- string. This parser will fail on identifiers that are reserved
        -- words.
        -- Legal identifier (start) characters and reserved words are
        -- defined in the 'LanguageDef' that is passed to
        -- 'makeTokenParser'. An @identifier@ is treated as
        -- a single token using 'try'.

        identifier       :: ParsecT s u m String,

        -- | The lexeme parser @reserved name@ parses @symbol
        -- name@, but it also checks that the @name@ is not a prefix of a
        -- valid identifier. A @reserved@ word is treated as a single token
        -- using 'try'.

        reserved         :: String -> ParsecT s u m (),

        -- | This lexeme parser parses a legal operator. Returns the name of the
        -- operator. This parser will fail on any operators that are reserved
        -- operators. Legal operator (start) characters and reserved operators
        -- are defined in the 'LanguageDef' that is passed to
        -- 'makeTokenParser'. An @operator@ is treated as a
        -- single token using 'try'.

        operator         :: ParsecT s u m String,

        -- | The lexeme parser @reservedOp name@ parses @symbol
        -- name@, but it also checks that the @name@ is not a prefix of a
        -- valid operator. A @reservedOp@ is treated as a single token using
        -- 'try'.

        reservedOp       :: String -> ParsecT s u m (),

        -- | This lexeme parser parses a single literal character. Returns the
        -- literal character value. This parser deals correctly with escape
        -- sequences. The literal character is parsed according to the grammar
        -- rules defined in the Haskell report (which matches most programming
        -- languages quite closely).

        charLiteral      :: ParsecT s u m Char,

        -- | This lexeme parser parses a literal string. Returns the literal
        -- string value. This parser deals correctly with escape sequences and
        -- gaps. The literal string is parsed according to the grammar rules
        -- defined in the Haskell report (which matches most programming
        -- languages quite closely).

        stringLiteral    :: ParsecT s u m String,

        -- | This lexeme parser parses a natural number (a positive whole
        -- number). Returns the value of the number. The number can be
        -- specified in 'decimal', 'hexadecimal' or
        -- 'octal'. The number is parsed according to the grammar
        -- rules in the Haskell report.

        natural          :: ParsecT s u m Integer,

        -- | This lexeme parser parses an integer (a whole number). This parser
        -- is like 'natural' except that it can be prefixed with a
        -- sign (i.e. \'-\' or \'+\'). Returns the value of the number. The
        -- number can be specified in 'decimal', 'hexadecimal'
        -- or 'octal'. The number is parsed according
        -- to the grammar rules in the Haskell report.

        integer          :: ParsecT s u m Integer,

        -- | This lexeme parser parses a floating point value. Returns the value
        -- of the number. The number is parsed according to the grammar rules
        -- defined in the Haskell report.

        float            :: ParsecT s u m Double,

        -- | This lexeme parser parses either 'natural' or a 'float'.
        -- Returns the value of the number. This parser deals with
        -- any overlap in the grammar rules for naturals and floats. The number
        -- is parsed according to the grammar rules defined in the Haskell report.

        naturalOrFloat   :: ParsecT s u m (Either Integer Double),

        -- | Parses a positive whole number in the decimal system. Returns the
        -- value of the number.

        decimal          :: ParsecT s u m Integer,

        -- | Parses a positive whole number in the hexadecimal system. The number
        -- should be prefixed with \"0x\" or \"0X\". Returns the value of the
        -- number.

        hexadecimal      :: ParsecT s u m Integer,

        -- | Parses a positive whole number in the octal system. The number
        -- should be prefixed with \"0o\" or \"0O\". Returns the value of the
        -- number.
        octal            :: ParsecT s u m Integer,

        -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
        -- trailing white space.

        symbol           :: String -> ParsecT s u m String,

        -- | @lexeme p@ first applies parser @p@ and then the 'whiteSpace'
        -- parser, returning the value of @p@. Every lexical
        -- token (lexeme) is defined using @lexeme@; this way every parse
        -- starts at a point without white space. Parsers that use @lexeme@ are
        -- called /lexeme/ parsers in this document.
        --
        -- The only point where the 'whiteSpace' parser should be
        -- called explicitly is the start of the main parser in order to skip
        -- any leading white space.
        --
        -- > mainParser  = do{ whiteSpace
        -- >                 ; ds <- many (lexeme digit)
        -- >                 ; eof
        -- >                 ; return (sum ds)
        -- >                 }

        lexeme           :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Parses any white space. White space consists of /zero/ or more
        -- occurrences of a 'space', a line comment or a block (multi
        -- line) comment. Block comments may be nested. How comments are
        -- started and ended is defined in the 'LanguageDef'
        -- that is passed to 'makeTokenParser'.

        whiteSpace       :: ParsecT s u m (),

        -- | Lexeme parser @parens p@ parses @p@ enclosed in parentheses,
        -- returning the value of @p@.

        parens           :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (\'{\' and
        -- \'}\'), returning the value of @p@.

        braces           :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (\'\<\'
        -- and \'>\'), returning the value of @p@.

        angles           :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (\'[\'
        -- and \']\'), returning the value of @p@.

        brackets         :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | DEPRECATED: Use 'brackets'.

        squares          :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Lexeme parser @semi@ parses the character \';\' and skips any
        -- trailing white space. Returns the string \";\".

        semi             :: ParsecT s u m String,

        -- | Lexeme parser @comma@ parses the character \',\' and skips any
        -- trailing white space. Returns the string \",\".

        comma            :: ParsecT s u m String,

        -- | Lexeme parser @colon@ parses the character \':\' and skips any
        -- trailing white space. Returns the string \":\".

        colon            :: ParsecT s u m String,

        -- | Lexeme parser @dot@ parses the character \'.\' and skips any
        -- trailing white space. Returns the string \".\".

        dot              :: ParsecT s u m String,

        -- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
        -- separated by 'semi'. Returns a list of values returned by
        -- @p@.

        semiSep          :: forall a . ParsecT s u m a -> ParsecT s u m [a],

        -- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
        -- separated by 'semi'. Returns a list of values returned by @p@.

        semiSep1         :: forall a . ParsecT s u m a -> ParsecT s u m [a],

        -- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
        -- @p@ separated by 'comma'. Returns a list of values returned
        -- by @p@.

        commaSep         :: forall a . ParsecT s u m a -> ParsecT s u m [a],

        -- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
        -- @p@ separated by 'comma'. Returns a list of values returned
        -- by @p@.

        commaSep1        :: forall a . ParsecT s u m a -> ParsecT s u m [a]
    }

-----------------------------------------------------------
-- Given a LanguageDef, create a token parser.
-----------------------------------------------------------

-- | The expression @makeTokenParser language@ creates a 'GenTokenParser'
-- record that contains lexical parsers that are
-- defined using the definitions in the @language@ record.
--
-- The use of this function is quite stylized - one imports the
-- appropriate language definition and selects the lexical parsers that
-- are needed from the resulting 'GenTokenParser'.
--
-- > module Main where
-- >
-- > import Text.Parsec
-- > import qualified Text.Parsec.Token as P
-- > import Text.Parsec.Language (haskellDef)
-- >
-- > -- The parser
-- > ...
-- >
-- > expr  =   parens expr
-- >       <|> identifier
-- >       <|> ...
-- >
-- >
-- > -- The lexer
-- > lexer       = P.makeTokenParser haskellDef
-- >
-- > parens      = P.parens lexer
-- > braces      = P.braces lexer
-- > identifier  = P.identifier lexer
-- > reserved    = P.reserved lexer
-- > ...

makeTokenParser :: (Stream s m Char)
                => GenLanguageDef s u m -> GenTokenParser s u m
makeTokenParser languageDef
    = TokenParser{ identifier = identifier
                 , reserved = reserved
                 , operator = operator
                 , reservedOp = reservedOp

                 , charLiteral = charLiteral
                 , stringLiteral = stringLiteral
                 , natural = natural
                 , integer = integer
                 , float = float
                 , naturalOrFloat = naturalOrFloat
                 , decimal = decimal
                 , hexadecimal = hexadecimal
                 , octal = octal

                 , symbol = symbol
                 , lexeme = lexeme
                 , whiteSpace = whiteSpace

                 , parens = parens
                 , braces = braces
                 , angles = angles
                 , brackets = brackets
                 , squares = brackets
                 , semi = semi
                 , comma = comma
                 , colon = colon
                 , dot = dot
                 , semiSep = semiSep
                 , semiSep1 = semiSep1
                 , commaSep = commaSep
                 , commaSep1 = commaSep1
                 }
    where

    -----------------------------------------------------------
    -- Bracketing
    -----------------------------------------------------------
    parens p        = between (symbol "(") (symbol ")") p
    braces p        = between (symbol "{") (symbol "}") p
    angles p        = between (symbol "<") (symbol ">") p
    brackets p      = between (symbol "[") (symbol "]") p

    semi            = symbol ";"
    comma           = symbol ","
    dot             = symbol "."
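
    -- A brief usage sketch (illustrative only; @idList@ is a hypothetical
    -- name, not part of this record): the bracketing and separator lexeme
    -- parsers defined in this group compose directly, for example
    --
    -- > idList = brackets (commaSep identifier)
    --
    -- accepts input such as @[foo, bar, baz]@, skipping the white space
    -- after every token.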
    colon           = symbol ":"

    commaSep p      = sepBy p comma
    semiSep p       = sepBy p semi

    commaSep1 p     = sepBy1 p comma
    semiSep1 p      = sepBy1 p semi


    -----------------------------------------------------------
    -- Chars & Strings
    -----------------------------------------------------------
    charLiteral     = lexeme (between (char '\'')
                                      (char '\'' <?> "end of character")
                                      characterChar )
                    <?> "character"

    characterChar   = charLetter <|> charEscape
                    <?> "literal character"

    charEscape      = do{ char '\\'; escapeCode }
    charLetter      = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))


    stringLiteral   = lexeme (
                      do{ str <- between (char '"')
                                         (char '"' <?> "end of string")
                                         (many stringChar)
                        ; return (foldr (maybe id (:)) "" str)
                        }
                      <?> "literal string")

    stringChar      =   do{ c <- stringLetter; return (Just c) }
                    <|> stringEscape
                    <?> "string character"

    stringLetter    = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))

    stringEscape    = do{ char '\\'
                        ;     do{ escapeGap  ; return Nothing }
                          <|> do{ escapeEmpty; return Nothing }
                          <|> do{ esc <- escapeCode; return (Just esc) }
                        }

    escapeEmpty     = char '&'
    escapeGap       = do{ many1 space
                        ; char '\\' <?> "end of string gap"
                        }


    -- escape codes
    escapeCode      = charEsc <|> charNum <|> charAscii <|> charControl
                    <?> "escape code"

    -- '\^A' .. '\^Z' denote the control characters 1 .. 26
    charControl     = do{ char '^'
                        ; code <- upper
                        ; return (toEnum (fromEnum code - fromEnum 'A' + 1))
                        }

    charNum         = do{ code <- decimal
                                  <|> do{ char 'o'; number 8 octDigit }
                                  <|> do{ char 'x'; number 16 hexDigit }
                        ; return (toEnum (fromInteger code))
                        }

    charEsc         = choice (map parseEsc escMap)
                    where
                      parseEsc (c,code)     = do{ char c; return code }

    charAscii       = choice (map parseAscii asciiMap)
                    where
                      parseAscii (asc,code) = try (do{ string asc; return code })


    -- escape code tables
    escMap          = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'")
    asciiMap        = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)

    ascii2codes     = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
                       "FS","GS","RS","US","SP"]
    ascii3codes     = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
                       "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
                       "CAN","SUB","ESC","DEL"]

    ascii2          = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI',
                       '\EM','\FS','\GS','\RS','\US','\SP']
    ascii3          = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK',
                       '\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK',
                       '\SYN','\ETB','\CAN','\SUB','\ESC','\DEL']


    -----------------------------------------------------------
    -- Numbers
    -----------------------------------------------------------
    naturalOrFloat  = lexeme (natFloat) <?> "number"

    float           = lexeme floating   <?> "float"
    integer         = lexeme int        <?> "integer"
    natural         = lexeme nat        <?> "natural"


    -- floats
    floating        = do{ n <- decimal
                        ; fractExponent n
                        }

    natFloat        = do{ char '0'
                        ; zeroNumFloat
                        }
                      <|> decimalFloat

    zeroNumFloat    =  do{ n <- hexadecimal <|> octal
                         ; return (Left n)
                         }
                    <|> decimalFloat
                    <|> fractFloat 0
                    <|> return (Left 0)

    decimalFloat    = do{ n <- decimal
                        ; option (Left n)
                                 (fractFloat n)
                        }

    fractFloat n    = do{ f <- fractExponent n
                        ; return (Right f)
                        }

    fractExponent n = do{ fract <- fraction
                        ; expo  <- option 1.0 exponent'
                        ; return ((fromInteger n + fract)*expo)
                        }
                    <|>
                      do{ expo <- exponent'
                        ; return ((fromInteger n)*expo)
                        }

    fraction        = do{ char '.'
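                        -- the digits after the '.' are folded from the right:
                        -- each step divides the accumulated value by 10, so
                        -- ".25" contributes the fraction 0.25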
                        ; digits <- many1 digit <?> "fraction"
                        ; return (foldr op 0.0 digits)
                        }
                      <?> "fraction"
                    where
                      op d f    = (f + fromIntegral (digitToInt d))/10.0

    exponent'       = do{ oneOf "eE"
                        ; f <- sign
                        ; e <- decimal <?> "exponent"
                        ; return (power (f e))
                        }
                      <?> "exponent"
                    where
                      power e  | e < 0      = 1.0/power(-e)
                               | otherwise  = fromInteger (10^e)


    -- integers and naturals
    int             = do{ f <- lexeme sign
                        ; n <- nat
                        ; return (f n)
                        }

    sign            =   (char '-' >> return negate)
                    <|> (char '+' >> return id)
                    <|> return id

    nat             = zeroNumber <|> decimal

    zeroNumber      = do{ char '0'
                        ; hexadecimal <|> octal <|> decimal <|> return 0
                        }
                      <?> ""

    decimal         = number 10 digit
    hexadecimal     = do{ oneOf "xX"; number 16 hexDigit }
    octal           = do{ oneOf "oO"; number 8 octDigit  }

    number base baseDigit
        = do{ digits <- many1 baseDigit
            ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits
            ; seq n (return n)
            }

    -----------------------------------------------------------
    -- Operators & reserved ops
    -----------------------------------------------------------
    reservedOp name =
        lexeme $ try $
        do{ string name
          ; notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
          }

    operator =
        lexeme $ try $
        do{ name <- oper
          ; if (isReservedOp name)
             then unexpected ("reserved operator " ++ show name)
             else return name
          }

    oper =
        do{ c <- (opStart languageDef)
          ; cs <- many (opLetter languageDef)
          ; return (c:cs)
          }
        <?> "operator"

    isReservedOp name =
        isReserved (sort (reservedOpNames languageDef)) name


    -----------------------------------------------------------
    -- Identifiers & Reserved words
    -----------------------------------------------------------
    reserved name =
        lexeme $ try $
        do{ caseString name
          ; notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
          }

    caseString name
        | caseSensitive languageDef  = string name
        | otherwise                  = do{ walk name; return name }
        where
          walk []     = return ()
          walk (c:cs) = do{ caseChar c <?> msg; walk cs }

          caseChar c  | isAlpha c  = char (toLower c) <|> char (toUpper c)
                      | otherwise  = char c

          msg         = show name


    identifier =
        lexeme $ try $
        do{ name <- ident
          ; if (isReservedName name)
             then unexpected ("reserved word " ++ show name)
             else return name
          }

    ident
        = do{ c <- identStart languageDef
            ; cs <- many (identLetter languageDef)
            ; return (c:cs)
            }
        <?> "identifier"

    isReservedName name
        = isReserved theReservedNames caseName
        where
          caseName   | caseSensitive languageDef  = name
                     | otherwise                  = map toLower name


    isReserved names name
        = scan names
        where
          scan []      = False
          scan (r:rs)  = case (compare r name) of
                           LT  -> scan rs
                           EQ  -> True
                           GT  -> False

    theReservedNames
        | caseSensitive languageDef  = sort reserved
        | otherwise                  = sort .
                                       map (map toLower) $ reserved
        where
          reserved = reservedNames languageDef


    -----------------------------------------------------------
    -- White space & symbols
    -----------------------------------------------------------
    symbol name
        = lexeme (string name)

    lexeme p
        = do{ x <- p; whiteSpace; return x }


    --whiteSpace
    whiteSpace
        | noLine && noMulti  = skipMany (simpleSpace <?> "")
        | noLine             = skipMany (simpleSpace <|> multiLineComment <?> "")
        | noMulti            = skipMany (simpleSpace <|> oneLineComment <?> "")
        | otherwise          = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <?> "")
        where
          noLine  = null (commentLine languageDef)
          noMulti = null (commentStart languageDef)

    simpleSpace =
        skipMany1 (satisfy isSpace)

    oneLineComment =
        do{ try (string (commentLine languageDef))
          ; skipMany (satisfy (/= '\n'))
          ; return ()
          }

    multiLineComment =
        do { try (string (commentStart languageDef))
           ; inComment
           }

    inComment
        | nestedComments languageDef  = inCommentMulti
        | otherwise                   = inCommentSingle

    inCommentMulti
        =   do{ try (string (commentEnd languageDef)) ; return () }
        <|> do{ multiLineComment                      ; inCommentMulti }
        <|> do{ skipMany1 (noneOf startEnd)           ; inCommentMulti }
        <|> do{ oneOf startEnd                        ; inCommentMulti }
        <?> "end of comment"
        where
          startEnd = nub (commentEnd languageDef ++ commentStart languageDef)

    inCommentSingle
        =   do{ try (string (commentEnd languageDef)) ; return () }
        <|> do{ skipMany1 (noneOf startEnd)           ; inCommentSingle }
        <|> do{ oneOf startEnd                        ; inCommentSingle }
        <?> "end of comment"
        where
          startEnd = nub (commentEnd languageDef ++ commentStart languageDef)
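
-- A minimal end-to-end sketch (illustrative only; @lexer@ and @numberList@
-- are hypothetical names, not part of this module). Assuming the 'haskellDef'
-- definition from "Text.Parsec.Language" and 'parse' from "Text.Parsec", a
-- caller builds the token parser once and then uses its fields as ordinary
-- parsers:
--
-- > lexer :: TokenParser ()
-- > lexer = makeTokenParser haskellDef
-- >
-- > numberList :: Parsec String () [Integer]
-- > numberList = whiteSpace lexer >> commaSep lexer (natural lexer)
--
-- Running @parse numberList "" "  1, 0x2A, 0o17"@ yields @Right [1,42,15]@:
-- 'whiteSpace' skips the leading spaces once, and every lexeme parser skips
-- its own trailing white space and comments.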