{
-----------------------------------------------------------------------------
-- |
-- Module      : Language.Python.Version3.Parser.Lexer
-- Copyright   : (c) 2009 Bernie Pope
-- License     : BSD-style
-- Maintainer  : bjpop@csse.unimelb.edu.au
-- Stability   : experimental
-- Portability : ghc
--
-- Implementation of a lexer for Python version 3.x programs. Generated by
-- alex. Edited by Curran McConnell to conform to PEP 515.
-----------------------------------------------------------------------------

module Language.Python.Version3.Parser.Lexer
   (initStartCodeStack, lexToken, endOfFileToken, lexCont) where

import Language.Python.Common.Token
import Language.Python.Common.ParserMonad hiding (location)
import Language.Python.Common.SrcLocation
import Language.Python.Common.LexerUtils
import qualified Data.Map as Map
}

-- character sets
$lf = \n                   -- line feed
$cr = \r                   -- carriage return
$eol_char = [$lf $cr]      -- any end of line character
$not_eol_char = ~$eol_char -- anything but an end of line character
$white_char = [\ \n\r\f\v\t]
$white_no_nl = $white_char # $eol_char
$ident_letter = [a-zA-Z_]
$digit = 0-9
$non_zero_digit = 1-9
$oct_digit = 0-7
$hex_digit = [$digit a-fA-F]
$bin_digit = 0-1
$short_str_char = [^ \n \r ' \" \\]
$long_str_char = [. \n] # [' \"]
$short_byte_str_char = \0-\127 # [\n \r ' \" \\]
$long_byte_str_char = \0-\127 # [' \"]
$not_single_quote = [. \n] # '
$not_double_quote = [. \n] # \"

-- macro definitions
@exponent = (e | E) (\+ | \-)? $digit(_?$digit+)*
@fraction = \. $digit(_?$digit+)*
@int_part = $digit(_?$digit+)*
@point_float = (@int_part? @fraction) | @int_part \.
@exponent_float = (@int_part | @point_float) @exponent
@float_number = @point_float | @exponent_float
@eol_pattern = $lf | $cr $lf | $cr
@one_single_quote = ' $not_single_quote
@two_single_quotes = '' $not_single_quote
@one_double_quote = \" $not_double_quote
@two_double_quotes = \"\" $not_double_quote
@byte_str_prefix = b | B
@raw_str_prefix = r | R
@unicode_str_prefix = u | U
@format_str_prefix = f | F
@raw_byte_str_prefix = @byte_str_prefix @raw_str_prefix | @raw_str_prefix @byte_str_prefix
@format_raw_str_prefix = @format_str_prefix @raw_str_prefix | @raw_str_prefix @format_str_prefix
@backslash_pair = \\ (\\|'|\"|@eol_pattern|$short_str_char)
@backslash_pair_bs = \\ (\\|'|\"|@eol_pattern|$short_byte_str_char)
@short_str_item_single = $short_str_char|@backslash_pair|\"
@short_str_item_double = $short_str_char|@backslash_pair|'
@short_byte_str_item_single = $short_byte_str_char|@backslash_pair_bs|\"
@short_byte_str_item_double = $short_byte_str_char|@backslash_pair_bs|'
@long_str_item_single = $long_str_char|@backslash_pair|@one_single_quote|@two_single_quotes|\"
@long_str_item_double = $long_str_char|@backslash_pair|@one_double_quote|@two_double_quotes|'
@long_byte_str_item_single = $long_byte_str_char|@backslash_pair_bs|@one_single_quote|@two_single_quotes|\"
@long_byte_str_item_double = $long_byte_str_char|@backslash_pair_bs|@one_double_quote|@two_double_quotes|'
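
-- NOTE: the (_?$digit+) style sub-patterns here (and the corresponding
-- hex/octal/binary patterns in the integer rules below) implement the
-- underscore grouping of PEP 515. For illustration (derived from these
-- patterns, not exhaustive): 1_000, 3.14_15, 0x_FF and 0b1_0 are all
-- accepted, while 1__000 (doubled underscore) and 1000_ (trailing
-- underscore) are not.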
tokens :-

-- The rules below could match inside a string literal, but they will
-- not be applied, because the rule for the literal always matches a
-- longer sequence of characters.

\# ($not_eol_char)* { token (\ span lit val -> CommentToken span lit) id }
$white_no_nl+ ; -- skip whitespace

\\ @eol_pattern { lineJoin } -- line join

<0> {
   @float_number { token FloatToken (readFloat.delUnderscores) }
   $non_zero_digit (_?$digit)* { token IntegerToken (read.delUnderscores) }
   (@float_number | @int_part) (j | J) { token ImaginaryToken (readFloat.init.delUnderscores) }
   0+(_?0+)* { token IntegerToken (read.delUnderscores) }
   0 (o | O) (_?$oct_digit+)+ { token IntegerToken (read.delUnderscores) }
   0 (x | X) (_?$hex_digit+)+ { token IntegerToken (read.delUnderscores) }
   0 (b | B) (_?$bin_digit+)+ { token IntegerToken (readBinary.delUnderscores) }
}

-- String literals

<0> {
   ' @short_str_item_single* ' { mkString stringToken }
   @raw_str_prefix ' @short_str_item_single* ' { mkString rawStringToken }
   @format_str_prefix ' @short_str_item_single* ' { mkString formatStringToken }
   @byte_str_prefix ' @short_byte_str_item_single* ' { mkString byteStringToken }
   @raw_byte_str_prefix ' @short_byte_str_item_single* ' { mkString rawByteStringToken }
   @format_raw_str_prefix ' @short_str_item_single* ' { mkString formatRawStringToken }
   @unicode_str_prefix ' @short_str_item_single* ' { mkString unicodeStringToken }

   \" @short_str_item_double* \" { mkString stringToken }
   @raw_str_prefix \" @short_str_item_double* \" { mkString rawStringToken }
   @format_str_prefix \" @short_str_item_double* \" { mkString formatStringToken }
   @byte_str_prefix \" @short_byte_str_item_double* \" { mkString byteStringToken }
   @raw_byte_str_prefix \" @short_byte_str_item_double* \" { mkString rawByteStringToken }
   @format_raw_str_prefix \" @short_str_item_double* \" { mkString formatRawStringToken }
   @unicode_str_prefix \" @short_str_item_double* \" { mkString unicodeStringToken }

   ''' @long_str_item_single* ''' { mkString stringToken }
   @raw_str_prefix ''' @long_str_item_single* ''' { mkString rawStringToken }
   @format_str_prefix ''' @long_str_item_single* ''' { mkString formatStringToken }
   @byte_str_prefix ''' @long_byte_str_item_single* ''' { mkString byteStringToken }
   @raw_byte_str_prefix ''' @long_byte_str_item_single* ''' { mkString rawByteStringToken }
   @format_raw_str_prefix ''' @long_str_item_single* ''' { mkString formatRawStringToken }
   @unicode_str_prefix ''' @long_str_item_single* ''' { mkString unicodeStringToken }

   \"\"\" @long_str_item_double* \"\"\" { mkString stringToken }
   @raw_str_prefix \"\"\" @long_str_item_double* \"\"\" { mkString rawStringToken }
   @format_str_prefix \"\"\" @long_str_item_double* \"\"\" { mkString formatStringToken }
   @byte_str_prefix \"\"\" @long_byte_str_item_double* \"\"\" { mkString byteStringToken }
   @raw_byte_str_prefix \"\"\" @long_byte_str_item_double* \"\"\" { mkString rawByteStringToken }
   @format_raw_str_prefix \"\"\" @long_str_item_double* \"\"\" { mkString formatRawStringToken }
   @unicode_str_prefix \"\"\" @long_str_item_double* \"\"\" { mkString unicodeStringToken }
}
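
-- For illustration, the prefixes above route each literal form to its
-- own token builder:
--    'x'  -> stringToken           r'x'        -> rawStringToken
--    f'x' -> formatStringToken     b'x'        -> byteStringToken
--    u'x' -> unicodeStringToken    rb'x'/br'x' -> rawByteStringToken
--    fr'x'/rf'x' -> formatRawStringToken
-- and likewise for the double-quoted and triple-quoted forms.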
-- NOTE: we pass lexToken into some functions as an argument.
-- That allows us to define those functions in a separate module,
-- which increases code reuse in the lexer (because that code can
-- be shared between the lexers for versions 2 and 3 of Python).
-- Unfortunately lexToken must be defined in this file because
-- it refers to data types which are only included by Alex in
-- the generated file (this seems like a limitation in Alex
-- that should be improved).

<0> {
   @eol_pattern { bolEndOfLine lexToken bol }
}

<dedent> () { dedentation lexToken }

-- beginning of line
<bol> {
   @eol_pattern { endOfLine lexToken }
   () { indentation lexToken dedent BOL }
}

-- beginning of file
<bof> {
   @eol_pattern { endOfLine lexToken }
   () { indentation lexToken dedent BOF }
}

<0> $ident_letter($ident_letter|$digit)* { \loc len str -> keywordOrIdent (take len str) loc }

-- operators and separators
--
<0> {
   "(" { openParen LeftRoundBracketToken }
   ")" { closeParen RightRoundBracketToken }
   "[" { openParen LeftSquareBracketToken }
   "]" { closeParen RightSquareBracketToken }
   "{" { openParen LeftBraceToken }
   "}" { closeParen RightBraceToken }
   "->" { symbolToken RightArrowToken }
   "." { symbolToken DotToken }
   "..." { symbolToken EllipsisToken }
   "~" { symbolToken TildeToken }
   "+" { symbolToken PlusToken }
   "-" { symbolToken MinusToken }
   "**" { symbolToken ExponentToken }
   "*" { symbolToken MultToken }
   "/" { symbolToken DivToken }
   "//" { symbolToken FloorDivToken }
   "%" { symbolToken ModuloToken }
   "<<" { symbolToken ShiftLeftToken }
   ">>" { symbolToken ShiftRightToken }
   "<" { symbolToken LessThanToken }
   "<=" { symbolToken LessThanEqualsToken }
   ">" { symbolToken GreaterThanToken }
   ">=" { symbolToken GreaterThanEqualsToken }
   "==" { symbolToken EqualityToken }
   "!=" { symbolToken NotEqualsToken }
   "^" { symbolToken XorToken }
   "|" { symbolToken BinaryOrToken }
   "&" { symbolToken BinaryAndToken }
   ":" { symbolToken ColonToken }
   "=" { symbolToken AssignToken }
   "+=" { symbolToken PlusAssignToken }
   "-=" { symbolToken MinusAssignToken }
   "*=" { symbolToken MultAssignToken }
   "/=" { symbolToken DivAssignToken }
   "%=" { symbolToken ModAssignToken }
   "**=" { symbolToken PowAssignToken }
   "&=" { symbolToken BinAndAssignToken }
   "|=" { symbolToken BinOrAssignToken }
   "^=" { symbolToken BinXorAssignToken }
   "<<=" { symbolToken LeftShiftAssignToken }
   ">>=" { symbolToken RightShiftAssignToken }
   "//=" { symbolToken FloorDivAssignToken }
   "@=" { symbolToken MatrixMultAssignToken }
   "," { symbolToken CommaToken }
   "@" { symbolToken AtToken }
   \; { symbolToken SemiColonToken }
}

{
-- The lexer starts off in the beginning-of-file state (bof).
initStartCodeStack :: [Int]
initStartCodeStack = [bof, 0]

lexToken :: P Token
lexToken = do
   location <- getLocation
   input <- getInput
   startCode <- getStartCode
   case alexScan (location, [], input) startCode of
      AlexEOF -> do
         -- Ensure there is a newline token before the EOF.
         previousToken <- getLastToken
         case previousToken of
            NewlineToken {} -> do
               -- Ensure that there are sufficient dedent tokens for
               -- the outstanding indentation levels.
               depth <- getIndentStackDepth
               if depth <= 1
                  then return endOfFileToken
                  else do
                     popIndent
                     return dedentToken
            other -> do
               let insertedNewlineToken = NewlineToken $ mkSrcSpan location location
               setLastToken insertedNewlineToken
               return insertedNewlineToken
      AlexError _ -> lexicalError
      AlexSkip (nextLocation, _bs, rest) len -> do
         setLocation nextLocation
         setInput rest
         lexToken
      AlexToken (nextLocation, _bs, rest) len action -> do
         setLocation nextLocation
         setInput rest
         token <- action (mkSrcSpan location $ decColumn 1 nextLocation) len input
         setLastToken token
         return token
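
-- For illustration only (not part of this module's public interface):
-- a minimal driver that collects every token up to, but excluding, the
-- end of file. It assumes the EOFToken constructor from
-- Language.Python.Common.Token; real clients should instead go through
-- lexCont below, which also records comments and skips line joins.
lexAll :: P [Token]
lexAll = do
   tok <- lexToken
   case tok of
      EOFToken {} -> return []
      _other -> fmap (tok :) lexAll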
-- This is called by the Happy parser.
lexCont :: (Token -> P a) -> P a
lexCont cont = lexLoop
   where
   -- lexLoop :: P a
   lexLoop = do
      tok <- lexToken
      case tok of
         CommentToken {} -> do
            addComment tok
            lexLoop
         LineJoinToken {} -> lexLoop
         _other -> cont tok

-- a keyword or an identifier (the syntax overlaps)
keywordOrIdent :: String -> SrcSpan -> P Token
keywordOrIdent str location
   = return $ case Map.lookup str keywords of
        Just symbol -> symbol location
        Nothing -> IdentifierToken location str

-- mapping from strings to keywords
keywords :: Map.Map String (SrcSpan -> Token)
keywords = Map.fromList keywordNames

keywordNames :: [(String, SrcSpan -> Token)]
keywordNames =
   [ ("False", FalseToken), ("class", ClassToken), ("finally", FinallyToken), ("is", IsToken), ("return", ReturnToken)
   , ("None", NoneToken), ("continue", ContinueToken), ("for", ForToken), ("lambda", LambdaToken), ("try", TryToken)
   , ("True", TrueToken), ("def", DefToken), ("from", FromToken), ("nonlocal", NonLocalToken), ("while", WhileToken)
   , ("and", AndToken), ("del", DeleteToken), ("global", GlobalToken), ("not", NotToken), ("with", WithToken)
   , ("as", AsToken), ("elif", ElifToken), ("if", IfToken), ("or", OrToken), ("yield", YieldToken)
   , ("assert", AssertToken), ("else", ElseToken), ("import", ImportToken), ("pass", PassToken)
   , ("break", BreakToken), ("except", ExceptToken), ("in", InToken), ("raise", RaiseToken)
   , ("async", AsyncToken), ("await", AwaitToken)
   ]
}
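
-- Usage note (illustrative): a Happy grammar consumes this lexer through
-- the threaded-lexer directive, along the lines of
--
--    %lexer { lexCont } { EOFToken {} }
--
-- The exact declaration, and the accompanying %monad hooks for the P
-- monad, belong in the grammar file (e.g. Parser.y), not here.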