module GLL.Combinators.Lexer (
    default_lexer, lexer, LexerSettings(..), emptyLanguage,
    ) where

import GLL.Types.Abstract (Token(..), SubsumesToken(..))
import Data.Char (isSpace, isDigit, isAlpha, isUpper, isLower)
import Text.Regex.Applicative

-- | Settings for changing the behaviour of the builtin lexer 'lexer'.
-- Lexers are built using "Text.Regex.Applicative".
data LexerSettings = LexerSettings {
        -- | Which keychars to recognise? Default: none.
        keychars        :: [Char]
        -- | Which keywords to recognise? Default: none.
    ,   keywords        :: [String]
        -- | What is considered a whitespace character? Default: 'Data.Char.isSpace'.
    ,   whitespace      :: Char -> Bool
        -- | How does a line comment start? Default: @//@.
    ,   lineComment     :: String
        -- | How to recognise identifiers? Default: a lowercase letter followed by letters, digits and underscores.
    ,   identifiers     :: RE Char String
        -- | How to recognise alternative identifiers? Default: an uppercase letter followed by letters, digits and underscores.
    ,   altIdentifiers  :: RE Char String
        -- | Arbitrary tokens /(a,b)/. /a/ is the token name, /b/ is a regular expression.
    ,   tokens          :: [(String, RE Char String)]
    }

-- | The default 'LexerSettings'.
emptyLanguage :: LexerSettings
emptyLanguage = LexerSettings [] [] isSpace "//"
    ((:) <$> psym isLower <*> id_tail)
    ((:) <$> psym isUpper <*> id_tail)
    []
 where -- the rest of an identifier after its first character: letters, digits and underscores
       id_tail = many (psym (\c -> isAlpha c || c == '_' || isDigit c))
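
-- A usage sketch (illustrative, not part of the library): customising the
-- defaults via record update. The name 'myLanguage' and the particular
-- keychars, keywords and comment marker below are hypothetical.
--
-- > myLanguage :: LexerSettings
-- > myLanguage = emptyLanguage
-- >     { keychars    = "(){};+"
-- >     , keywords    = ["if", "then", "else"]
-- >     , lineComment = "--"
-- >     }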

-- | A lexer using the default 'LexerSettings'.
default_lexer :: SubsumesToken t => String -> [t]
default_lexer = lexer emptyLanguage 
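
-- A quick sketch of the default behaviour (illustrative): the result below
-- assumes the concrete 'Token' type from "GLL.Types.Abstract", for which
-- 'upcast' is expected to be the identity.
--
-- > default_lexer "foo 42 // comment" :: [Token]
-- >   -- should yield [IDLit (Just "foo"), IntLit (Just 42)]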

-- | A lexer parameterised by 'LexerSettings'.
lexer :: SubsumesToken t => LexerSettings -> String -> [t]
lexer _ [] = []
lexer lexsets s =
    let re =    (Just <$> lTokens lexsets)
            <|> (Nothing <$ some (psym (whitespace lexsets)))
            <|> (Nothing <$ string (lineComment lexsets) <* many (psym ((/=) '\n')))
    in case findLongestPrefix re s of
        Just (Just tok, rest)   -> tok : lexer lexsets rest
        Just (Nothing,rest)     -> lexer lexsets rest
        Nothing                 -> error ("lexical error at: " ++ show (take 10 s))
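
-- Continuing the hypothetical 'myLanguage' sketch above (illustrative): since
-- keywords are listed before identifiers in 'lTokens', an exact keyword such
-- as "if" should come out as 'Keyword' rather than 'IDLit'.
--
-- > lexer myLanguage "if x1 then y else 0 -- rest" :: [Token]
-- >   -- should yield [Keyword "if", IDLit (Just "x1"), Keyword "then",
-- >   --               IDLit (Just "y"), Keyword "else", IntLit (Just 0)]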

-- Recognises a single token according to the given 'LexerSettings'.
lTokens :: SubsumesToken t => LexerSettings -> RE Char t
lTokens lexsets =
        lCharacters
    <|> lKeywords
    <|> charsToInt  <$> optional (sym '-') <*> some (psym isDigit)
    <|> upcast . IDLit . Just <$> identifiers lexsets 
    <|> upcast . AltIDLit . Just <$> altIdentifiers lexsets
    <|> upcast . CharLit . Just <$> lCharLit
    <|> upcast . StringLit . Just <$> lStringLit
    <|> lMore
    where   
            charsToInt Nothing n = upcast (IntLit (Just (read n)))
            charsToInt (Just _) n = upcast (IntLit (Just (-(read n))))

            lChar c = upcast (Char c) <$ sym c
            lCharacters = foldr ((<|>) . lChar) empty (keychars lexsets) 

            lKeyword k  = upcast (Keyword k) <$ string k
            lKeywords = foldr ((<|>) . lKeyword) empty (keywords lexsets)

            lMore = foldr ((<|>) . uncurry lToken) empty (tokens lexsets)
            lToken t re = upcast . Token t . Just <$> re

            lStringLit = toString <$ sym '\"' <*> (concat <$> many strChar) <* sym '\"'
             where -- keep escape sequences intact so that 'read' can interpret them
                   strChar  = (\c -> ['\\', c]) <$> (sym '\\' *> anySym)
                              <|> (:[]) <$> psym ((/=) '\"')
                   toString inner = read ("\"" ++ inner ++ "\"")

            lCharLit = toChar <$ sym '\'' <*> charChar <* sym '\''
              where -- delegate escape handling (e.g. newline, backslash) to 'read'
                    charChar = (\c -> ['\\', c]) <$> (sym '\\' *> anySym)
                               <|> (:[]) <$> psym ((/=) '\'')
                    toChar inner = read ("'" ++ inner ++ "'")