-- | Reference lexer for the core language parser. Slow but simple.
--
--   The lexers here all use 'String' in place of a real name type.
--   After applying these functions to the program text, we need
--   to use `renameTok` to convert the strings in `TokNamed` tokens
--   into the name type specific to the language fragment to be parsed.
--
module DDC.Core.Lexer
        ( module DDC.Core.Lexer.Tokens
        , module DDC.Core.Lexer.Names

          -- * Lexer
        , lexModuleWithOffside
        , lexExp)
where
import DDC.Core.Lexer.Offside
import DDC.Core.Lexer.Comments
import DDC.Core.Lexer.Names
import DDC.Core.Lexer.Tokens
import DDC.Data.SourcePos
import DDC.Data.Token
import Data.Char
import Data.Text                (Text)
import qualified Data.Text      as T
import Data.Monoid


-- Module -----------------------------------------------------------------------------------------
-- | Lex a module and apply the offside rule.
--
--   Automatically drop comments from the token stream along the way.
--
lexModuleWithOffside
        :: FilePath     -- ^ Path to source file, for error messages.
        -> Int          -- ^ Starting line number.
        -> String       -- ^ String containing program text.
        -> [Token (Tok String)]

lexModuleWithOffside sourceName lineStart str
 = {-# SCC lexWithOffside #-}
   applyOffside [] []
        $ addStarts
        $ dropComments
        $ lexText sourceName lineStart
        $ T.pack str
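
-- A minimal usage sketch: 'sourceText' is a hypothetical String holding the
-- program text, and the file name is a placeholder used only in error
-- messages.
--
-- > lexModuleWithOffside "Source.ds" 1 sourceText
--
-- The tokens come back annotated with source positions, with comments
-- dropped and the offside rule already applied, so the parser itself never
-- needs to track indentation.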

-- Exp --------------------------------------------------------------------------------------------
-- | Lex a string into tokens.
--
--   Automatically drop comments from the token stream along the way.
--
lexExp  :: FilePath     -- ^ Path to source file, for error messages.
        -> Int          -- ^ Starting line number.
        -> String       -- ^ String containing program text.
        -> [Token (Tok String)]

lexExp sourceName lineStart str
 = {-# SCC lexExp #-}
   dropNewLines
        $ dropComments
        $ lexText sourceName lineStart
        $ T.pack str


-- Generic ----------------------------------------------------------------------------------------
-- | Tokenize some input text.
--
--   NOTE: Although the main interface for the lexer uses standard Haskell
--   strings, we use Text internally to get proper Unicode tokenization.
--   Eventually we should refactor the API to only pass around Text rather
--   than Strings.
--
lexText :: String       -- ^ Name of source file, which is attached to the tokens.
        -> Int          -- ^ Starting line number.
        -> Text         -- ^ Text to tokenize.
        -> [Token (Tok String)]

lexText sourceName lineStart xx
 = lexWord lineStart 1 xx
 where
  lexWord :: Int -> Int -> Text -> [Token (Tok String)]
  lexWord line column w
   = match w
   where
        -- Build a token, attaching the current source position.
        tok t           = Token t (SourcePos sourceName line column)
        tokM            = tok . KM
        tokA            = tok . KA
        tokN            = tok . KN

        -- Lex the rest of the input, advancing the column by n characters.
        lexMore n rest  = lexWord line (column + n) rest

        -- Skip forward to the first occurrence of the given pattern,
        -- or to the end of the input if there isn't one.
        lexUpto pat rest
         = case dropWhile (not . T.isPrefixOf pat) (T.tails rest) of
                x : _   -> x
                _       -> T.empty

        txt             = T.pack
        prefix str      = T.stripPrefix (T.pack str)

        match cs
         | T.null cs
         = []

         -- Whitespace. A tab counts as 8 columns.
         | Just (' ',  rest) <- T.uncons cs = lexMore 1 rest
         | Just ('\t', rest) <- T.uncons cs = lexMore 8 rest

         -- Meta tokens.
         | Just rest     <- T.stripPrefix (txt "{-#") cs
         , (prag, rest') <- T.breakOn (txt "#-}") rest
         , rest''        <- T.drop 3 rest'
         , len           <- 3 + T.length prag + 3
         = tokA (KPragma prag)          : lexMore len rest''

         | Just rest     <- T.stripPrefix (txt "{-") cs
         = tokM KCommentBlockStart      : lexMore 2 (lexUpto (txt "-}") rest)

         | Just rest     <- T.stripPrefix (txt "-}") cs
         = tokM KCommentBlockEnd        : lexMore 2 rest

         | Just cs1      <- T.stripPrefix (txt "--") cs
         , (_junk, rest) <- T.span (/= '\n') cs1
         = tokM KCommentLineStart       : lexMore 2 rest

         | Just ('\n', rest) <- T.uncons cs
         = tokM KNewLine                : lexWord (line + 1) 1 rest

         -- Double character symbols.
         | not (T.compareLength cs 2 == LT)
         , (cs1, rest)   <- T.splitAt 2 cs
         , Just t
            <- case T.unpack cs1 of
                "[:"    -> Just KSquareColonBra
                ":]"    -> Just KSquareColonKet
                "{:"    -> Just KBraceColonBra
                ":}"    -> Just KBraceColonKet
                "~>"    -> Just KArrowTilde
                "->"    -> Just KArrowDash
                "<-"    -> Just KArrowDashLeft
                "=>"    -> Just KArrowEquals
                "/\\"   -> Just KBigLambdaSlash
                "()"    -> Just KDaConUnit
                _       -> Nothing
         = tokA t : lexMore 2 rest

         -- Wrapped operator symbols.
         -- This needs to come before lexing single character symbols.
         | Just ('(', cs1)  <- T.uncons cs
         , Just (c,   cs2)  <- T.uncons cs1
         , isOpStart c
         , (body, cs3)      <- T.span isOpBody cs2
         , Just (')', rest) <- T.uncons cs3
         = tokA (KOpVar (T.unpack (T.cons c body)))
                : lexMore (2 + T.length (T.cons c body)) rest

         -- Literal numeric values.
         -- This needs to come before the rule for '-'.
         | Just (c, cs1) <- T.uncons cs
         , isDigit c
         , (body, rest)  <- T.span isLitBody cs1
         = let  str     = T.unpack (T.cons c body)
           in   tokN (KLit str) : lexMore (length str) rest

         | Just ('-', cs1) <- T.uncons cs
         , Just (c, _)     <- T.uncons cs1
         , isDigit c
         = let  (body, rest) = T.span isLitBody cs1
                str          = T.unpack (T.cons '-' body)
           in   tokN (KLit str) : lexMore (length str) rest

         -- Literal strings.
         -- We force these to be null terminated so the representation is
         -- compatible with C string functions.
         | Just ('\"', cc) <- T.uncons cs
         = let  eat n acc xs
                 -- Handle the escape sequences \" and \n.
                 | Just ('\\', xs1) <- T.uncons xs
                 , Just ('"',  xs2) <- T.uncons xs1
                 = eat (n + 2) ('"' : acc) xs2

                 | Just ('\\', xs1) <- T.uncons xs
                 , Just ('n',  xs2) <- T.uncons xs1
                 = eat (n + 2) ('\n' : acc) xs2

                 -- An unescaped double quote ends the string.
                 | Just ('"', xs1)  <- T.uncons xs
                 = tokA (KString (T.pack (reverse acc)))
                        : lexWord line (column + n) xs1

                 | Just (c, xs1)    <- T.uncons xs
                 = eat (n + 1) (c : acc) xs1

                 -- Ran out of input before the closing quote.
                 | otherwise
                 = [tok $ KErrorUnterm (T.unpack cs)]
           in   eat 0 [] cc

         -- Operator symbols.
         | Just (c, cs1) <- T.uncons cs
         , isOpStart c
         , (body, rest)  <- T.span isOpBody cs1
         , sym           <- T.cons c body
         , sym /= T.pack "="
         , sym /= T.pack "|"
         = tokA (KOp (T.unpack sym)) : lexMore (1 + T.length body) rest

         -- Single character symbols.
         | Just (c, rest) <- T.uncons cs
         , Just t
            <- case c of
                '('     -> Just KRoundBra
                ')'     -> Just KRoundKet
                '['     -> Just KSquareBra
                ']'     -> Just KSquareKet
                '{'     -> Just KBraceBra
                '}'     -> Just KBraceKet
                '.'     -> Just KDot
                ','     -> Just KComma
                ';'     -> Just KSemiColon
                '\\'    -> Just KBackSlash
                '='     -> Just KEquals
                '|'     -> Just KBar
                _       -> Nothing
         = tokA t : lexMore 1 rest

         -- De Bruijn indices.
         | Just ('^', cs1) <- T.uncons cs
         , (ds, rest)      <- T.span isDigit cs1
         , T.length ds >= 1
         = tokA (KIndex (read (T.unpack ds))) : lexMore (1 + T.length ds) rest
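         -- Note: the de Bruijn rule above must stay before the bare '^' rule
         -- below, so that "^2" lexes as the single token (KIndex 2) rather
         -- than as KHat followed by the literal "2".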
         -- Operator body symbols.
         | Just ('^', rest) <- T.uncons cs
         = tokA KHat        : lexMore 1 rest

         -- Lambdas.
         | Just ('λ', rest) <- T.uncons cs
         = tokA KLambda     : lexMore 1 rest

         | Just ('Λ', rest) <- T.uncons cs
         = tokA KBigLambda  : lexMore 1 rest

         -- Bottoms.
         | Just rest <- prefix "Pure"  cs
         = tokA KBotEffect  : lexMore 4 rest

         | Just rest <- prefix "Empty" cs
         = tokA KBotClosure : lexMore 5 rest

         -- Named constructors.
         | Just (c, cs1) <- T.uncons cs
         , isConStart c
         , (body,  rest)  <- T.span isConBody cs1
         , (body', rest') <- case T.uncons rest of
                                Just ('\'', rest') -> (body <> T.pack "'", rest')
                                Just ('#',  rest') -> (body <> T.pack "#", rest')
                                _                  -> (body, rest)
         = let  readNamedCon s
                 | Just socon   <- readSoConBuiltin s
                 = tokA (KSoConBuiltin socon)   : lexMore (length s) rest'

                 | Just kicon   <- readKiConBuiltin s
                 = tokA (KKiConBuiltin kicon)   : lexMore (length s) rest'

                 | Just twcon   <- readTwConBuiltin s
                 = tokA (KTwConBuiltin twcon)   : lexMore (length s) rest'

                 | Just tccon   <- readTcConBuiltin s
                 = tokA (KTcConBuiltin tccon)   : lexMore (length s) rest'

                 | Just con     <- readCon s
                 = tokN (KCon con)              : lexMore (length s) rest'

                 | otherwise
                 = [tok (KErrorJunk [c])]

           in   readNamedCon (T.unpack (T.cons c body'))

         -- Keywords, named variables and witness constructors.
         | Just (c, cs1) <- T.uncons cs
         , isVarStart c
         , (body,  rest)  <- T.span isVarBody cs1
         , (body', rest') <- case T.uncons rest of
                                Just ('#', rest') -> (body <> T.pack "#", rest')
                                _                 -> (body, rest)
         = let  readNamedVar s
                 | "_"          <- s
                 = tokA KUnderscore     : lexMore (length s) rest'

                 | Just t       <- lookup s keywords
                 = tok t                : lexMore (length s) rest'

                 | Just v       <- readVar s
                 = tokN (KVar v)        : lexMore (length s) rest'

                 | otherwise
                 = [tok (KErrorJunk [c])]

           in   readNamedVar (T.unpack (T.cons c body'))

         -- Some unrecognised character.
         | otherwise
         = case T.unpack cs of
                (c : _) -> [tok $ KErrorJunk [c]]
                _       -> [tok $ KErrorJunk []]
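

-- Example ----------------------------------------------------------------------------------------
-- A minimal usage sketch, not part of the original API. The binding name
-- 'exampleTokens', the placeholder file name and the program text are all
-- illustrative only; the expression exercises the lambda, bracket and
-- de Bruijn index rules above.
exampleTokens :: [Token (Tok String)]
exampleTokens
        = lexExp "<example>" 1 "\\x. f [:a b:] ^0 x"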