module GLL.Combinators.Lexer (
default_lexer, lexer, LexerSettings(..), emptyLanguage,
) where
import GLL.Types.Abstract (Token(..), SubsumesToken(..))
import Data.Char (isSpace, isDigit, isAlpha, isUpper, isLower)
import Text.Regex.Applicative
data LexerSettings = LexerSettings {
keychars :: [Char]
, keywords :: [String]
, whitespace :: Char -> Bool
, lineComment :: String
, identifiers :: RE Char String
, altIdentifiers :: RE Char String
, tokens :: [(String, RE Char String)]
}
emptyLanguage :: LexerSettings
emptyLanguage = LexerSettings [] [] isSpace "//"
((:) <$> psym isLower <*> lowercase_id)
((:) <$> psym isUpper <*> lowercase_id)
[]
where lowercase_id = many (psym (\c -> isAlpha c || c == '_' || isDigit c))
default_lexer :: SubsumesToken t => String -> [t]
default_lexer = lexer emptyLanguage
lexer :: SubsumesToken t => LexerSettings -> String -> [t]
lexer _ [] = []
lexer lexsets s =
let re = (Just <$> lTokens lexsets)
<|> (Nothing <$ some (psym (whitespace lexsets)))
<|> (Nothing <$ string (lineComment lexsets) <* many (psym ((/=) '\n')))
in case findLongestPrefix re s of
Just (Just tok, rest) -> tok : lexer lexsets rest
Just (Nothing,rest) -> lexer lexsets rest
Nothing -> error ("lexical error at: " ++ show (take 10 s))
lTokens :: SubsumesToken t => LexerSettings -> RE Char t
lTokens lexsets =
lCharacters
<|> lKeywords
<|> charsToInt <$> optional (sym '-') <*> some (psym isDigit)
<|> upcast . IDLit . Just <$> identifiers lexsets
<|> upcast . AltIDLit . Just <$> altIdentifiers lexsets
<|> upcast . CharLit . Just <$> lCharLit
<|> upcast . StringLit . Just <$> lStringLit
<|> lMore
where
charsToInt Nothing n = upcast (IntLit (Just (read n)))
charsToInt (Just _) n = upcast (IntLit (Just ((read n))))
lChar c = upcast (Char c) <$ sym c
lCharacters = foldr ((<|>) . lChar) empty (keychars lexsets)
lKeyword k = upcast (Keyword k) <$ string k
lKeywords = foldr ((<|>) . lKeyword) empty (keywords lexsets)
lMore = foldr ((<|>) . uncurry lToken) empty (tokens lexsets)
lToken t re = upcast . Token t . Just <$> re
lStringLit = toString <$ sym '\"' <*> many strChar <* sym '\"'
where strChar = sym '\\' *> sym '\"'
<|> psym ((/=) '\"')
toString inner = read ("\"" ++ inner ++ "\"")
lCharLit = id <$ sym '\'' <*> charChar <* sym '\''
where charChar = sym '\\' *> sym '\''
<|> psym ((/=) '\'')