{-# OPTIONS #-} ----------------------------------------------------------------------------- -- | -- Module : Language.Python.Common.LexerUtils -- Copyright : (c) 2009 Bernie Pope -- License : BSD-style -- Maintainer : bjpop@csse.unimelb.edu.au -- Stability : experimental -- Portability : ghc -- -- Various utilities to support the Python lexer. ----------------------------------------------------------------------------- module Language.Python.Common.LexerUtils where import Control.Monad (liftM) import Data.List (foldl') import Data.Word (Word8) import Language.Python.Common.Token as Token import Language.Python.Common.ParserMonad hiding (location) import Language.Python.Common.SrcLocation import Codec.Binary.UTF8.String as UTF8 (encode) type Byte = Word8 -- Beginning of. BOF = beginning of file, BOL = beginning of line data BO = BOF | BOL -- Functions for building tokens type StartCode = Int type Action = SrcSpan -> Int -> String -> P Token lineJoin :: Action lineJoin span _len _str = return $ LineJoinToken $ spanStartPoint span endOfLine :: P Token -> Action endOfLine lexToken span _len _str = do setLastEOL $ spanStartPoint span lexToken bolEndOfLine :: P Token -> Int -> Action bolEndOfLine lexToken bol span len inp = do pushStartCode bol endOfLine lexToken span len inp dedentation :: P Token -> Action dedentation lexToken span _len _str = do topIndent <- getIndent -- case compare (endCol span) topIndent of case compare (startCol span) topIndent of EQ -> do popStartCode lexToken LT -> do popIndent return dedentToken GT -> spanError span "indentation error" indentation :: P Token -> Int -> BO -> Action -- Check if we are at the EOF. If yes, we may need to generate a newline, -- in case we came here from BOL (but not BOF). indentation lexToken _dedentCode bo _loc _len [] = do popStartCode case bo of BOF -> lexToken BOL -> newlineToken indentation lexToken dedentCode bo span _len _str = do popStartCode parenDepth <- getParenStackDepth if parenDepth > 0 then lexToken else do topIndent <- getIndent -- case compare (endCol span) topIndent of case compare (startCol span) topIndent of EQ -> case bo of BOF -> lexToken BOL -> newlineToken LT -> do pushStartCode dedentCode newlineToken -- GT -> do pushIndent (endCol span) GT -> do pushIndent (startCol span) return indentToken where indentToken = IndentToken span symbolToken :: (SrcSpan -> Token) -> Action symbolToken mkToken location _ _ = return (mkToken location) token :: (SrcSpan -> String -> a -> Token) -> (String -> a) -> Action token mkToken read location len str = return $ mkToken location literal (read literal) where literal = take len str -- special tokens for the end of file and end of line endOfFileToken :: Token endOfFileToken = EOFToken SpanEmpty dedentToken = DedentToken SpanEmpty newlineToken :: P Token newlineToken = do loc <- getLastEOL return $ NewlineToken loc -- Test if we are at the end of the line or file atEOLorEOF :: a -> AlexInput -> Int -> AlexInput -> Bool atEOLorEOF _user _inputBeforeToken _tokenLength (_loc, _bs, inputAfterToken) = null inputAfterToken || nextChar == '\n' || nextChar == '\r' where nextChar = head inputAfterToken notEOF :: a -> AlexInput -> Int -> AlexInput -> Bool notEOF _user _inputBeforeToken _tokenLength (_loc, _bs, inputAfterToken) = not (null inputAfterToken) delUnderscores :: String -> String delUnderscores [] = [] delUnderscores ('_':xs) = delUnderscores xs delUnderscores (x :xs) = x : delUnderscores xs readBinary :: String -> Integer readBinary = toBinary . drop 2 where toBinary = foldl' acc 0 acc b '0' = 2 * b acc b '1' = 2 * b + 1 acc _ _ = error "Lexer ensures all digits passed to readBinary are 0 or 1." readFloat :: String -> Double readFloat str@('.':cs) = read ('0':readFloatRest str) readFloat str = read (readFloatRest str) readFloatRest :: String -> String readFloatRest [] = [] readFloatRest ['.'] = ".0" readFloatRest (c:cs) = c : readFloatRest cs mkString :: (SrcSpan -> String -> Token) -> Action mkString toToken loc len str = do return $ toToken loc (take len str) stringToken :: SrcSpan -> String -> Token stringToken = StringToken rawStringToken :: SrcSpan -> String -> Token rawStringToken = StringToken byteStringToken :: SrcSpan -> String -> Token byteStringToken = ByteStringToken formatStringToken :: SrcSpan -> String -> Token formatStringToken = StringToken formatRawStringToken :: SrcSpan -> String -> Token formatRawStringToken = StringToken unicodeStringToken :: SrcSpan -> String -> Token unicodeStringToken = UnicodeStringToken rawByteStringToken :: SrcSpan -> String -> Token rawByteStringToken = ByteStringToken openParen :: (SrcSpan -> Token) -> Action openParen mkToken loc _len _str = do let token = mkToken loc pushParen token return token closeParen :: (SrcSpan -> Token) -> Action closeParen mkToken loc _len _str = do let token = mkToken loc topParen <- getParen case topParen of Nothing -> spanError loc err1 Just open -> if matchParen open token then popParen >> return token else spanError loc err2 where -- XXX fix these error messages err1 = "Lexical error ! unmatched closing paren" err2 = "Lexical error ! unmatched closing paren" matchParen :: Token -> Token -> Bool matchParen (LeftRoundBracketToken {}) (RightRoundBracketToken {}) = True matchParen (LeftBraceToken {}) (RightBraceToken {}) = True matchParen (LeftSquareBracketToken {}) (RightSquareBracketToken {}) = True matchParen _ _ = False -- ----------------------------------------------------------------------------- -- Functionality required by Alex type AlexInput = (SrcLocation, -- current src location [Byte], -- byte buffer for next character String) -- input string alexInputPrevChar :: AlexInput -> Char alexInputPrevChar _ = error "alexInputPrevChar not used" -- byte buffer should be empty here alexGetChar :: AlexInput -> Maybe (Char, AlexInput) alexGetChar (loc, [], input) | null input = Nothing | otherwise = seq nextLoc (Just (nextChar, (nextLoc, [], rest))) where nextChar = head input rest = tail input nextLoc = moveChar nextChar loc alexGetChar (loc, _:_, _) = error "alexGetChar called with non-empty byte buffer" -- mapFst :: (a -> b) -> (a, c) -> (b, c) -- mapFst f (a, c) = (f a, c) alexGetByte :: AlexInput -> Maybe (Byte, AlexInput) -- alexGetByte = fmap (mapFst (fromIntegral . ord)) . alexGetChar alexGetByte (loc, b:bs, input) = Just (b, (loc, bs, input)) alexGetByte (loc, [], []) = Nothing alexGetByte (loc, [], nextChar:rest) = seq nextLoc (Just (byte, (nextLoc, restBytes, rest))) where nextLoc = moveChar nextChar loc byte:restBytes = UTF8.encode [nextChar] moveChar :: Char -> SrcLocation -> SrcLocation moveChar '\n' = incLine 1 moveChar '\t' = incTab moveChar '\r' = id moveChar _ = incColumn 1 lexicalError :: P a lexicalError = do location <- getLocation c <- liftM head getInput throwError $ UnexpectedChar c location readOctNoO :: String -> Integer readOctNoO (zero:rest) = read (zero:'O':rest) readOctNoO [] = error "Lexer ensures readOctNoO is never called on an empty string"