{-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} module Language.Lua.LexerUtils where import Control.DeepSeq (NFData(..)) import Data.Char (ord) import Data.Text (Text) import qualified Data.Text as Text import Data.Word (Word8) #if !MIN_VERSION_base(4,8,0) import Control.Applicative (Applicative(..)) #endif import Language.Lua.Token -- | Lua token with position information. data LexToken = LexToken { ltokToken :: Token , ltokRange :: SourceRange , ltokLexeme :: Text } deriving (Show,Eq) -- | Type of alex actions type Action = AlexInput {- ^ state at the start of the lexeme -} -> AlexInput {- ^ state at the end of the lexeme -} -> Int {- ^ number of characters in the lexeme -} -> Mode {- ^ lexer mode -} -> (Mode, [LexToken]) {- ^ updated mode, lexemes -} -- | The remaining input text annotated with the starting position data AlexInput = AlexInput { input_pos :: {-# UNPACK #-} !SourcePos , input_prev :: {-# UNPACK #-} !SourcePos , input_text :: {-# UNPACK #-} !Text } -- | Lexer mode data Mode = NormalMode | StringMode StringMode [SourceRange] AlexInput -- ^ string type, errors, input at start | CommentMode AlexInput -- ^ Single line comment. Input at beginning of comment | QuoteMode AlexInput -- input at beginning of long-quote Int -- delim length Bool -- is comment -- ^ start delimlen iscomment data StringMode = SingleQuote | DoubleQuote -- | This is called when we encounter the end of a line before seeing -- the closing character for a string. unterminatedString :: Action unterminatedString _inp1 inp2 _len mode = case mode of StringMode _strTy _errs inp0 -> ( NormalMode , [ longToken inp0 inp2 TokUntermString ] ) _ -> error "[bug] unterminatedString outside a string." -- | An unknown character in "normal mode" invalidChar :: Action invalidChar inp1 _ _ _ = ( NormalMode , [ LexToken { ltokToken = TokUnexpected , ltokRange = singleRange (input_pos inp1) , ltokLexeme = Text.take 1 (input_text inp1) } ] ) -- | A a bad escape withing a string invalidEsc :: Action invalidEsc inp1 inp2 _ mode = case mode of StringMode m errs inp0 -> (StringMode m (err : errs) inp0, []) where err = SourceRange { sourceFrom = input_pos inp1 , sourceTo = input_prev inp2 } _ -> error "[bug] invalidEsc outside a string." checkEOF :: Mode -> AlexInput -> [LexToken] checkEOF mode AlexInput { input_prev = end } = case mode of NormalMode {} -> [] CommentMode {} -> [] QuoteMode inp _ True -> ret TokUntermComment inp QuoteMode inp _ _ -> ret TokUntermString inp StringMode _ _ inp -> ret TokUntermString inp where ret t AlexInput { input_pos = start, input_text = rest } = [ LexToken { ltokToken = t , ltokRange = SourceRange { sourceFrom = start, sourceTo = end } , ltokLexeme = rest } ] -- | Start lexing a long-quoted string literal enterLongString :: Action enterLongString inp _ len _ = (QuoteMode inp len False, []) -- | Start lexing of a string literal enterString :: StringMode -> Action enterString sm inp _ _ _ = (StringMode sm [] inp, []) -- | Start lexing a long-quoted comment enterLongComment :: Action enterLongComment inp _ len _ = (QuoteMode inp (len - 2) True, []) -- | Start lexing a single-line comment enterComment :: Action enterComment inp _ _ _ = (CommentMode inp, []) -- | Construct a lexeme spanning multiple matches longToken :: AlexInput {- ^ input from the mode -} -> AlexInput {- ^ current input -} -> Token {- ^ token for lexeme -} -> LexToken longToken AlexInput { input_pos = start, input_text = text } AlexInput { input_prev = end } t = LexToken { ltokToken = t , ltokRange = SourceRange { sourceFrom = start, sourceTo = end } , ltokLexeme = Text.take lexLen text } where lexLen = 1 + sourcePosIndex end - sourcePosIndex start -- | The closing delimiter for long-quoted lexemes must be the same length as -- the opening delimiter. This predicate checks if the currently match -- delimiter is the right length. endStringPredicate :: Mode {- ^ lexer mode -} -> AlexInput {- ^ input stream before the token -} -> Int {- ^ length of the token -} -> AlexInput {- ^ input stream after the token -} -> Bool {- ^ is expected ending long-quote -} endStringPredicate mode _ len _ = case mode of QuoteMode _ startlen _ -> len == startlen _ -> False -- | Action called at the end of a lexer-sub mode. endMode :: Action endMode _ inp2 _ mode = (NormalMode, [lexeme]) where lexeme = case mode of StringMode _ err inp -> longToken inp inp2 $ if null err then TokSLit else TokUnexpected CommentMode inp -> longToken inp inp2 TokComment QuoteMode inp _ isComment -> longToken inp inp2 $ if isComment then TokComment else TokSLit NormalMode -> error "endMode: internal lexer error" -- | Simplest action emitting a lexeme for the current match tok :: Token -> Action tok token inp1 inp2 len mode = (mode, [t]) where t = LexToken { ltokToken = token , ltokRange = SourceRange { sourceFrom = input_pos inp1 , sourceTo = input_prev inp2 } , ltokLexeme = Text.take len (input_text inp1) } -- | Drop the first line of a Lua file when it starts with a '#' dropSpecialComment :: Text -> Text dropSpecialComment text | "#" `Text.isPrefixOf` text = Text.dropWhile (/='\n') text | otherwise = text -- Newline is preserved in order to ensure that line numbers stay correct -- | This function drops whitespace and comments from a list of lexemes -- in order to make it suitable for parsing. dropWhiteSpace :: [LexToken] -> [LexToken] dropWhiteSpace = filter (not . isWhite . ltokToken) where isWhite TokWhiteSpace = True isWhite TokComment = True isWhite _ = False -------------------------------------------------------------------------------- -- Positions and Ranges -- | The type of locations in a source file data SourcePos = SourcePos { sourcePosName :: String , sourcePosIndex, sourcePosLine, sourcePosColumn :: {-# UNPACK #-}!Int } deriving (Show,Eq) instance NFData SourcePos where rnf (SourcePos _ _ _ _) = () data SourceRange = SourceRange { sourceFrom :: !SourcePos, sourceTo :: !SourcePos } deriving (Show,Eq) instance NFData SourceRange where rnf (SourceRange _ _) = () singleRange :: SourcePos -> SourceRange singleRange p = SourceRange { sourceFrom = p, sourceTo = p } startPos :: String -> SourcePos startPos n = SourcePos n 0 1 1 showPos :: SourcePos -> String showPos p = show (sourcePosLine p) ++ ":" ++ show (sourcePosColumn p) showRange :: SourceRange -> String showRange p = showPos (sourceFrom p) ++ "--" ++ showPos (sourceTo p) -------------------------------------------------------------------------------- -- Scanner -- This is unused because we don't use regular expressions with -- "left contexts" (see Alex docs). It is still in the code though, -- so we provide this stub. alexInputPrevChar :: a -> () alexInputPrevChar _ = () -- | Attempt to retrieve the next representative element for the character -- at the head of the input string. Returns an advanced 'AlexInput' alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) alexGetByte AlexInput { input_pos = p, input_text = text } = do (c,text') <- Text.uncons text let p' = move p c x = fromIntegral (min 127 (ord c)) inp = AlexInput { input_prev = p, input_pos = p', input_text = text' } x `seq` inp `seq` return (x, inp) -- | Update a 'SourcePos' for a particular matched character move :: SourcePos -> Char -> SourcePos move (SourcePos name index line column) c = case c of '\t' -> SourcePos name (index+1) line (((column + 7) `div` 8) * 8 + 1) '\n' -> SourcePos name (index+1) (line + 1) 1 _ -> SourcePos name (index+1) line (column + 1)