module Language.Lua.Lexer ( luaLexer -- * re-exports , LexicalError(..) , TokenStream(..) , runLexer , streamToList , streamToEitherList ) where import Language.Lua.Token import Data.Char (chr, isDigit, isHexDigit, isSpace) import Data.List (foldl') import Data.Maybe (fromMaybe) import Data.Monoid import Language.Lexer.Applicative import Numeric (readHex) import Text.Regex.Applicative -- | -- @ -- lex :: String -> [L Token] -- lex = 'streamToList' . 'runLexer' 'luaLexer' "" -- @ -- -- >>> lex "5+5" -- [TkIntLit "5",TkPlus,TkIntLit "5"] -- >>> lex "foo?" -- [TkIdent "foo"*** Exception: Lexical error at :1:4 luaLexer :: Lexer Token luaLexer = luaTokens <> luaWhitespace luaTokens :: Lexer Token luaTokens = token (longest luaToken) <> token (longestShortest luaLongBracketStringLitPrefix luaLongBracketStringLitSuffix) luaToken :: RE Char Token luaToken = TkAnd <$ "and" <|> TkBreak <$ "break" <|> TkDo <$ "do" <|> TkElse <$ "else" <|> TkElseif <$ "elseif" <|> TkEnd <$ "end" <|> TkFalse <$ "false" <|> TkFor <$ "for" <|> TkFunction <$ "function" <|> TkGoto <$ "goto" <|> TkIf <$ "if" <|> TkIn <$ "in" <|> TkLocal <$ "local" <|> TkNil <$ "nil" <|> TkNot <$ "not" <|> TkOr <$ "or" <|> TkRepeat <$ "repeat" <|> TkReturn <$ "return" <|> TkThen <$ "then" <|> TkTrue <$ "true" <|> TkUntil <$ "until" <|> TkWhile <$ "while" <|> TkAssign <$ "=" <|> TkNeq <$ "~=" <|> TkPlus <$ "+" <|> TkDash <$ "-" <|> TkMult <$ "*" <|> TkFloatDiv <$ "/" <|> TkModulo <$ "%" <|> TkExponent <$ "^" <|> TkLength <$ "#" <|> TkBitwiseAnd <$ "&" <|> TkBitwiseOr <$ "|" <|> TkLt <$ "<" <|> TkGt <$ ">" <|> TkLShift <$ "<<" <|> TkRShift <$ ">>" <|> TkFloorDiv <$ "//" <|> TkEq <$ "==" <|> TkLeq <$ "<=" <|> TkGeq <$ ">=" <|> TkLParen <$ "(" <|> TkRParen <$ ")" <|> TkLBrace <$ "{" <|> TkRBrace <$ "}" <|> TkLBracket <$ "[" <|> TkRBracket <$ "]" <|> TkSemi <$ ";" <|> TkColon <$ ":" <|> TkComma <$ "," <|> TkLabel <$ "::" <|> TkDot <$ "." <|> TkConcat <$ ".." <|> TkVararg <$ "..." <|> TkTilde <$ "~" <|> TkQuote <$ "'" <|> TkDoubleQuote <$ "\"" <|> TkIdent <$> luaIdentifier <|> TkStringLit <$> luaStringLit <|> TkFloatLit <$> luaFloatLit <|> TkIntLit <$> luaIntLit luaIdentifier :: RE Char String luaIdentifier = (:) <$> psym identFirst <*> many (psym identRest) where identFirst :: Char -> Bool identFirst = (||) <$> (== '_') <*> isAlpha identRest :: Char -> Bool identRest = (||) <$> identFirst <*> isDigit luaStringLit :: RE Char String luaStringLit = singleQuoted <|> doubleQuoted where singleQuoted :: RE Char String singleQuoted = quoted '\'' doubleQuoted :: RE Char String doubleQuoted = quoted '\"' quoted :: Char -> RE Char String quoted c = between c c $ many (escapeSequence <|> not_c) where not_c :: RE Char Char not_c = psym (/= c) escapeSequence :: RE Char Char escapeSequence = sym '\\' *> sequences where sequences :: RE Char Char sequences = '\a' <$ sym 'a' <|> '\b' <$ sym 'b' <|> '\f' <$ sym 'f' <|> '\n' <$ sym 'n' <|> '\r' <$ sym 'r' <|> '\t' <$ sym 't' <|> '\v' <$ sym 'v' <|> sym '\\' <|> sym '"' <|> sym '\'' <|> sym '\n' <|> hexEscape <|> decimalEscape -- TODO: unicode escape -- TODO: \z where hexEscape :: RE Char Char hexEscape = go <$> (sym 'x' *> hexDigit) <*> hexDigit where go :: Char -> Char -> Char go x y = let [(n,"")] = readHex [x,y] in chr n decimalEscape :: RE Char Char decimalEscape = chr . read <$> betweenN 1 3 digit -- unicodeEscape :: RE Char String -- unicodeEscape = (\a b c -> a ++ b ++ [c]) -- <$> "u{" -- <*> some hexDigit -- <*> sym '}' luaIntLit :: RE Char String luaIntLit = hexLit <|> some digit where hexLit :: RE Char String hexLit = (\a b c -> a:b:c) <$> sym '0' <*> oneOf "xX" <*> some hexDigit luaFloatLit :: RE Char String luaFloatLit = hexLit <|> decimalLit where hexLit :: RE Char String hexLit = (\a b cs ds -> a : b : cs ++ ds) <$> sym '0' <*> oneOf "xX" <*> some hexDigit <*> andOr (fractionalSuffix hexDigit) (exponentPart "pP") decimalLit :: RE Char String decimalLit = (++) <$> some digit <*> andOr (fractionalSuffix digit) (exponentPart "eE") exponentPart :: String -> RE Char String exponentPart cs = f <$> oneOf cs <*> optional (oneOf "-+") <*> some digit -- Yes, digit, even for binaryExponent: 0x0p+A is invalid where f :: Char -> Maybe Char -> String -> String f x Nothing zs = x:zs f x (Just y) zs = x:y:zs fractionalSuffix :: RE Char Char -> RE Char String fractionalSuffix c = (:) <$> sym '.' <*> many c luaLongBracketStringLitPrefix :: RE Char String luaLongBracketStringLitPrefix = between '[' '[' (many (sym '=')) luaLongBracketStringLitSuffix :: String -> RE Char Token luaLongBracketStringLitSuffix eqs = TkStringLit <$> -- "For convenience, when the opening long bracket is immediately followed -- by a newline, the newline is not included in the string." (optional (sym '\n') *> many anySym <* between ']' ']' (exactlyN (length eqs) (sym '='))) luaWhitespace :: Lexer Token luaWhitespace = mconcat [ whitespace $ longest (psym isSpace) , whitespace $ longestShortest luaBlockCommentPrefix luaBlockCommentSuffix , whitespace $ longest luaLineComment ] -- A comment starts with a double hyphen (--) anywhere outside a string. If the -- text immediately after -- is not an opening long bracket, the comment is a -- short comment, which runs until the end of the line. Otherwise, it is a long -- comment, which runs until the corresponding closing long bracket. Long -- comments are frequently used to disable code temporarily. luaBlockCommentPrefix :: RE Char String luaBlockCommentPrefix = "--[[" luaBlockCommentSuffix :: String -> RE Char String luaBlockCommentSuffix _ = many anySym *> "]]" luaLineComment :: RE Char String luaLineComment = "--" <|> "--\n" <|> "--" *> psym p *> many (psym (/= '\n')) <|> "--[\n" <|> "--[" *> psym p *> many (psym (/= '\n')) where p :: Char -> Bool p c = c /= '[' && c /= '\n' -------------------------------------------------------------------------------- -- Regex extras oneOf :: Eq s => [s] -> RE s s oneOf = foldl' (\acc x -> acc <|> sym x) empty -- `andOr f x y` will succeed on `x`, `y`, and `xy` andOr :: forall m. Monoid m => RE Char m -> RE Char m -> RE Char m andOr rx ry = ry <|> liftA2 (\x my -> x <> fromMaybe mempty my) rx (optional ry) between :: Char -> Char -> RE Char a -> RE Char a between x y z = sym x *> z <* sym y hexDigit :: RE Char Char hexDigit = psym isHexDigit digit :: RE Char Char digit = psym isDigit -------------------------------------------------------------------------------- -- Char extras isAlpha :: Char -> Bool isAlpha = (||) <$> isLowercase <*> isUppercase isLowercase :: Char -> Bool isLowercase = charBetween 'a' 'z' isUppercase :: Char -> Bool isUppercase = charBetween 'A' 'Z' charBetween :: Char -> Char -> Char -> Bool charBetween x y = (&&) <$> (>= x) <*> (<= y) -------------------------------------------------------------------------------- -- Applicative/Alternative extras exactlyN :: Alternative f => Int -> f a -> f [a] exactlyN 0 _ = pure [] exactlyN 1 f = pure <$> f exactlyN n f = (:) <$> f <*> exactlyN (n-1) f -- Inclusive betweenN :: Alternative f => Int -> Int -> f a -> f [a] betweenN x y _ | x > y = error "betweenN: x > y" betweenN 0 y f = upToN y f betweenN x y f = liftA2 (++) (exactlyN x f) (upToN (y-x) f) -- Inclusive upToN :: Alternative f => Int -> f a -> f [a] upToN 0 _ = pure [] upToN 1 f = fmap pure f <|> pure [] upToN n f = let g = upToN (n-1) f in liftA2 (:) f g <|> g