module Curry.Syntax.Lexer
(
Token (..), Category (..), Attributes (..)
, lexSource, lexer, fullLexer
) where
import Prelude hiding (fail)
import Data.Char
( chr, ord, isAlpha, isAlphaNum, isDigit, isHexDigit, isOctDigit
, isSpace, isUpper, toLower
)
import Data.List (intercalate)
import qualified Data.Map as Map
(Map, union, lookup, findWithDefault, fromList)
import Curry.Base.LexComb
import Curry.Base.Position
import Curry.Base.Span
data Token = Token Category Attributes
instance Eq Token where
Token c1 _ == Token c2 _ = c1 == c2
instance Ord Token where
Token c1 _ `compare` Token c2 _ = c1 `compare` c2
instance Symbol Token where
isEOF (Token c _) = c == EOF
dist _ (Token VSemicolon _) = (0, 0)
dist _ (Token VRightBrace _) = (0, 0)
dist _ (Token EOF _) = (0, 0)
dist _ (Token DotDot _) = (0, 1)
dist _ (Token DoubleColon _) = (0, 1)
dist _ (Token LeftArrow _) = (0, 1)
dist _ (Token RightArrow _) = (0, 1)
dist _ (Token DoubleArrow _) = (0, 1)
dist _ (Token KW_do _) = (0, 1)
dist _ (Token KW_if _) = (0, 1)
dist _ (Token KW_in _) = (0, 1)
dist _ (Token KW_of _) = (0, 1)
dist _ (Token Id_as _) = (0, 1)
dist _ (Token KW_let _) = (0, 2)
dist _ (Token PragmaEnd _) = (0, 2)
dist _ (Token KW_case _) = (0, 3)
dist _ (Token KW_class _) = (0, 4)
dist _ (Token KW_data _) = (0, 3)
dist _ (Token KW_default _) = (0, 6)
dist _ (Token KW_deriving _) = (0, 7)
dist _ (Token KW_else _) = (0, 3)
dist _ (Token KW_free _) = (0, 3)
dist _ (Token KW_then _) = (0, 3)
dist _ (Token KW_type _) = (0, 3)
dist _ (Token KW_fcase _) = (0, 4)
dist _ (Token KW_infix _) = (0, 4)
dist _ (Token KW_instance _) = (0, 7)
dist _ (Token KW_where _) = (0, 4)
dist _ (Token Id_ccall _) = (0, 4)
dist _ (Token KW_import _) = (0, 5)
dist _ (Token KW_infixl _) = (0, 5)
dist _ (Token KW_infixr _) = (0, 5)
dist _ (Token KW_module _) = (0, 5)
dist _ (Token Id_forall _) = (0, 5)
dist _ (Token Id_hiding _) = (0, 5)
dist _ (Token KW_newtype _) = (0, 6)
dist _ (Token KW_external _) = (0, 7)
dist _ (Token Id_interface _) = (0, 8)
dist _ (Token Id_primitive _) = (0, 8)
dist _ (Token Id_qualified _) = (0, 8)
dist _ (Token PragmaHiding _) = (0, 9)
dist _ (Token PragmaLanguage _) = (0, 11)
dist _ (Token Id a) = distAttr False a
dist _ (Token QId a) = distAttr False a
dist _ (Token Sym a) = distAttr False a
dist _ (Token QSym a) = distAttr False a
dist _ (Token IntTok a) = distAttr False a
dist _ (Token FloatTok a) = distAttr False a
dist _ (Token CharTok a) = distAttr False a
dist c (Token StringTok a) = updColDist c (distAttr False a)
dist _ (Token LineComment a) = distAttr True a
dist c (Token NestedComment a) = updColDist c (distAttr True a)
dist _ (Token PragmaOptions a) = let (ld, cd) = distAttr False a
in (ld, cd + 11)
dist _ _ = (0, 0)
updColDist :: Int -> Distance -> Distance
updColDist c (ld, cd) = (ld, if ld == 0 then cd else cd - c + 1)
distAttr :: Bool -> Attributes -> Distance
distAttr isComment attr = case attr of
NoAttributes -> (0, 0)
CharAttributes _ orig -> (0, length orig + 1)
IntAttributes _ orig -> (0, length orig - 1)
FloatAttributes _ orig -> (0, length orig - 1)
StringAttributes _ orig
| isComment -> (ld, cd)
| '\n' `elem` orig -> (ld, cd + 1)
| otherwise -> (ld, cd + 2)
where ld = length (filter (== '\n') orig)
cd = length (takeWhile (/= '\n') (reverse orig)) - 1
IdentAttributes mid i -> (0, length (intercalate "." (mid ++ [i])) - 1)
OptionsAttributes mt args -> case mt of
Nothing -> (0, distArgs + 1)
Just t -> (0, length t + distArgs + 2)
where distArgs = length args
data Category
= CharTok
| IntTok
| FloatTok
| StringTok
| Id
| QId
| Sym
| QSym
| LeftParen
| RightParen
| Semicolon
| LeftBrace
| RightBrace
| LeftBracket
| RightBracket
| Comma
| Underscore
| Backquote
| VSemicolon
| VRightBrace
| KW_case
| KW_class
| KW_data
| KW_default
| KW_deriving
| KW_do
| KW_else
| KW_external
| KW_fcase
| KW_free
| KW_if
| KW_import
| KW_in
| KW_infix
| KW_infixl
| KW_infixr
| KW_instance
| KW_let
| KW_module
| KW_newtype
| KW_of
| KW_then
| KW_type
| KW_where
| At
| Colon
| DotDot
| DoubleColon
| Equals
| Backslash
| Bar
| LeftArrow
| RightArrow
| Tilde
| DoubleArrow
| Id_as
| Id_ccall
| Id_forall
| Id_hiding
| Id_interface
| Id_primitive
| Id_qualified
| SymDot
| SymMinus
| SymStar
| PragmaLanguage
| PragmaOptions
| PragmaHiding
| PragmaMethod
| PragmaModule
| PragmaEnd
| LineComment
| NestedComment
| EOF
deriving (Eq, Ord)
data Attributes
= NoAttributes
| CharAttributes { cval :: Char , original :: String }
| IntAttributes { ival :: Integer , original :: String }
| FloatAttributes { fval :: Double , original :: String }
| StringAttributes { sval :: String , original :: String }
| IdentAttributes { modulVal :: [String] , sval :: String }
| OptionsAttributes { toolVal :: Maybe String, toolArgs :: String }
instance Show Attributes where
showsPrec _ NoAttributes = showChar '_'
showsPrec _ (CharAttributes cv _) = shows cv
showsPrec _ (IntAttributes iv _) = shows iv
showsPrec _ (FloatAttributes fv _) = shows fv
showsPrec _ (StringAttributes sv _) = shows sv
showsPrec _ (IdentAttributes mid i) = showsEscaped
$ intercalate "." $ mid ++ [i]
showsPrec _ (OptionsAttributes mt s) = showsTool mt
. showChar ' ' . showString s
where showsTool = maybe id (\t -> showChar '_' . showString t)
showsEscaped :: String -> ShowS
showsEscaped s = showChar '`' . showString s . showChar '\''
showsIdent :: Attributes -> ShowS
showsIdent a = showString "identifier " . shows a
showsSpecialIdent :: String -> ShowS
showsSpecialIdent s = showString "identifier " . showsEscaped s
showsOperator :: Attributes -> ShowS
showsOperator a = showString "operator " . shows a
showsSpecialOperator :: String -> ShowS
showsSpecialOperator s = showString "operator " . showsEscaped s
instance Show Token where
showsPrec _ (Token Id a) = showsIdent a
showsPrec _ (Token QId a) = showString "qualified "
. showsIdent a
showsPrec _ (Token Sym a) = showsOperator a
showsPrec _ (Token QSym a) = showString "qualified "
. showsOperator a
showsPrec _ (Token IntTok a) = showString "integer " . shows a
showsPrec _ (Token FloatTok a) = showString "float " . shows a
showsPrec _ (Token CharTok a) = showString "character " . shows a
showsPrec _ (Token StringTok a) = showString "string " . shows a
showsPrec _ (Token LeftParen _) = showsEscaped "("
showsPrec _ (Token RightParen _) = showsEscaped ")"
showsPrec _ (Token Semicolon _) = showsEscaped ";"
showsPrec _ (Token LeftBrace _) = showsEscaped "{"
showsPrec _ (Token RightBrace _) = showsEscaped "}"
showsPrec _ (Token LeftBracket _) = showsEscaped "["
showsPrec _ (Token RightBracket _) = showsEscaped "]"
showsPrec _ (Token Comma _) = showsEscaped ","
showsPrec _ (Token Underscore _) = showsEscaped "_"
showsPrec _ (Token Backquote _) = showsEscaped "`"
showsPrec _ (Token VSemicolon _)
= showsEscaped ";" . showString " (inserted due to layout)"
showsPrec _ (Token VRightBrace _)
= showsEscaped "}" . showString " (inserted due to layout)"
showsPrec _ (Token At _) = showsEscaped "@"
showsPrec _ (Token Colon _) = showsEscaped ":"
showsPrec _ (Token DotDot _) = showsEscaped ".."
showsPrec _ (Token DoubleArrow _) = showsEscaped "=>"
showsPrec _ (Token DoubleColon _) = showsEscaped "::"
showsPrec _ (Token Equals _) = showsEscaped "="
showsPrec _ (Token Backslash _) = showsEscaped "\\"
showsPrec _ (Token Bar _) = showsEscaped "|"
showsPrec _ (Token LeftArrow _) = showsEscaped "<-"
showsPrec _ (Token RightArrow _) = showsEscaped "->"
showsPrec _ (Token Tilde _) = showsEscaped "~"
showsPrec _ (Token SymDot _) = showsSpecialOperator "."
showsPrec _ (Token SymMinus _) = showsSpecialOperator "-"
showsPrec _ (Token SymStar _) = showsEscaped "*"
showsPrec _ (Token KW_case _) = showsEscaped "case"
showsPrec _ (Token KW_class _) = showsEscaped "class"
showsPrec _ (Token KW_data _) = showsEscaped "data"
showsPrec _ (Token KW_default _) = showsEscaped "default"
showsPrec _ (Token KW_deriving _) = showsEscaped "deriving"
showsPrec _ (Token KW_do _) = showsEscaped "do"
showsPrec _ (Token KW_else _) = showsEscaped "else"
showsPrec _ (Token KW_external _) = showsEscaped "external"
showsPrec _ (Token KW_fcase _) = showsEscaped "fcase"
showsPrec _ (Token KW_free _) = showsEscaped "free"
showsPrec _ (Token KW_if _) = showsEscaped "if"
showsPrec _ (Token KW_import _) = showsEscaped "import"
showsPrec _ (Token KW_in _) = showsEscaped "in"
showsPrec _ (Token KW_infix _) = showsEscaped "infix"
showsPrec _ (Token KW_infixl _) = showsEscaped "infixl"
showsPrec _ (Token KW_infixr _) = showsEscaped "infixr"
showsPrec _ (Token KW_instance _) = showsEscaped "instance"
showsPrec _ (Token KW_let _) = showsEscaped "let"
showsPrec _ (Token KW_module _) = showsEscaped "module"
showsPrec _ (Token KW_newtype _) = showsEscaped "newtype"
showsPrec _ (Token KW_of _) = showsEscaped "of"
showsPrec _ (Token KW_then _) = showsEscaped "then"
showsPrec _ (Token KW_type _) = showsEscaped "type"
showsPrec _ (Token KW_where _) = showsEscaped "where"
showsPrec _ (Token Id_as _) = showsSpecialIdent "as"
showsPrec _ (Token Id_ccall _) = showsSpecialIdent "ccall"
showsPrec _ (Token Id_forall _) = showsSpecialIdent "forall"
showsPrec _ (Token Id_hiding _) = showsSpecialIdent "hiding"
showsPrec _ (Token Id_interface _) = showsSpecialIdent "interface"
showsPrec _ (Token Id_primitive _) = showsSpecialIdent "primitive"
showsPrec _ (Token Id_qualified _) = showsSpecialIdent "qualified"
showsPrec _ (Token PragmaLanguage _) = showString "{-# LANGUAGE"
showsPrec _ (Token PragmaOptions a) = showString "{-# OPTIONS"
. shows a
showsPrec _ (Token PragmaHiding _) = showString "{-# HIDING"
showsPrec _ (Token PragmaMethod _) = showString "{-# METHOD"
showsPrec _ (Token PragmaModule _) = showString "{-# MODULE"
showsPrec _ (Token PragmaEnd _) = showString "#-}"
showsPrec _ (Token LineComment a) = shows a
showsPrec _ (Token NestedComment a) = shows a
showsPrec _ (Token EOF _) = showString "<end-of-file>"
tok :: Category -> Token
tok t = Token t NoAttributes
charTok :: Char -> String -> Token
charTok c o = Token CharTok CharAttributes { cval = c, original = o }
intTok :: Integer -> String -> Token
intTok base digits = Token IntTok IntAttributes
{ ival = convertIntegral base digits, original = digits }
floatTok :: String -> String -> Int -> String -> Token
floatTok mant frac expo rest = Token FloatTok FloatAttributes
{ fval = convertFloating mant frac expo
, original = mant ++ "." ++ frac ++ rest }
stringTok :: String -> String -> Token
stringTok cs s = Token StringTok StringAttributes { sval = cs, original = s }
idTok :: Category -> [String] -> String -> Token
idTok t mIdent ident = Token t
IdentAttributes { modulVal = mIdent, sval = ident }
pragmaOptionsTok :: Maybe String -> String -> Token
pragmaOptionsTok mbTool s = Token PragmaOptions
OptionsAttributes { toolVal = mbTool, toolArgs = s }
lineCommentTok :: String -> Token
lineCommentTok s = Token LineComment
StringAttributes { sval = s, original = s }
nestedCommentTok :: String -> Token
nestedCommentTok s = Token NestedComment
StringAttributes { sval = s, original = s }
reservedOps:: Map.Map String Category
reservedOps = Map.fromList
[ ("@" , At )
, (":" , Colon )
, ("=>", DoubleArrow)
, ("::", DoubleColon)
, ("..", DotDot )
, ("=" , Equals )
, ("\\", Backslash )
, ("|" , Bar )
, ("<-", LeftArrow )
, ("->", RightArrow )
, ("~" , Tilde )
]
reservedSpecialOps :: Map.Map String Category
reservedSpecialOps = Map.union reservedOps $ Map.fromList
[ ("." , SymDot )
, ("-" , SymMinus )
, ("*" , SymStar )
]
keywords :: Map.Map String Category
keywords = Map.fromList
[ ("case" , KW_case )
, ("class" , KW_class )
, ("data" , KW_data )
, ("default" , KW_default )
, ("deriving", KW_deriving)
, ("do" , KW_do )
, ("else" , KW_else )
, ("external", KW_external)
, ("fcase" , KW_fcase )
, ("free" , KW_free )
, ("if" , KW_if )
, ("import" , KW_import )
, ("in" , KW_in )
, ("infix" , KW_infix )
, ("infixl" , KW_infixl )
, ("infixr" , KW_infixr )
, ("instance", KW_instance)
, ("let" , KW_let )
, ("module" , KW_module )
, ("newtype" , KW_newtype )
, ("of" , KW_of )
, ("then" , KW_then )
, ("type" , KW_type )
, ("where" , KW_where )
]
keywordsSpecialIds :: Map.Map String Category
keywordsSpecialIds = Map.union keywords $ Map.fromList
[ ("as" , Id_as )
, ("ccall" , Id_ccall )
, ("forall" , Id_forall )
, ("hiding" , Id_hiding )
, ("interface", Id_interface)
, ("primitive", Id_primitive)
, ("qualified", Id_qualified)
]
pragmas :: Map.Map String Category
pragmas = Map.fromList
[ ("language", PragmaLanguage)
, ("options" , PragmaOptions )
, ("hiding" , PragmaHiding )
, ("method" , PragmaMethod )
, ("module" , PragmaModule )
]
isIdentChar :: Char -> Bool
isIdentChar c = isAlphaNum c || c `elem` "'_"
isSymbolChar :: Char -> Bool
isSymbolChar c = c `elem` "~!@#$%^&*+-=<>:?./|\\"
lexSource :: FilePath -> String -> CYM [(Span, Token)]
lexSource = parse (applyLexer fullLexer)
lexer :: Lexer Token a
lexer = skipWhiteSpace True
fullLexer :: Lexer Token a
fullLexer = skipWhiteSpace False
skipWhiteSpace :: Bool -> Lexer Token a
skipWhiteSpace skipComments suc fail = skip
where
skip sp [] bol = suc sp (tok EOF) sp [] bol
skip sp c@('-':'-':_) _ = lexLineComment sucComment fail sp c True
skip sp c@('{':'-':'#':_) bol = lexPragma noPragma suc fail sp c bol
skip sp c@('{':'-':_) bol = lexNestedComment sucComment fail sp c bol
skip sp cs@(c:s) bol
| c == '\t' = warnP sp "Tab character" skip (tabSpan sp) s bol
| c == '\n' = skip (nlSpan sp) s True
| isSpace c = skip (nextSpan sp) s bol
| bol = lexBOL suc fail sp cs bol
| otherwise = lexToken suc fail sp cs bol
sucComment = if skipComments then (\ _suc _fail -> skip) else suc
noPragma = lexNestedComment sucComment fail
lexLineComment :: Lexer Token a
lexLineComment suc _ sp str = case break (== '\n') str of
(c, s ) -> suc sp (lineCommentTok c) (incrSpan sp $ length c) s
lexPragma :: P a -> Lexer Token a
lexPragma noPragma suc fail sp0 str = pragma (incrSpan sp0 3) (drop 3 str)
where
skip = noPragma sp0 str
pragma sp [] = fail sp0 "Unterminated pragma" sp []
pragma sp cs@(c : s)
| c == '\t' = pragma (tabSpan sp) s
| c == '\n' = pragma (nlSpan sp) s
| isSpace c = pragma (nextSpan sp) s
| isAlpha c = case Map.lookup (map toLower prag) pragmas of
Nothing -> skip
Just PragmaOptions -> lexOptionsPragma sp0 suc fail sp1 rest
Just t -> suc sp0 (tok t) sp1 rest
| otherwise = skip
where
(prag, rest) = span isAlphaNum cs
sp1 = incrSpan sp (length prag)
lexOptionsPragma :: Span -> Lexer Token a
lexOptionsPragma sp0 _ fail sp [] = fail sp0 "Unterminated Options pragma" sp []
lexOptionsPragma sp0 suc fail sp (c : s)
| c == '\t' = lexArgs Nothing (tabSpan sp) s
| c == '\n' = lexArgs Nothing (nlSpan sp) s
| isSpace c = lexArgs Nothing (nextSpan sp) s
| c == '_' = let (tool, s1) = span isIdentChar s
in lexArgs (Just tool) (incrSpan sp (length tool + 1)) s1
| otherwise = fail sp0 "Malformed Options pragma" sp s
where
lexArgs mbTool = lexRaw ""
where
lexRaw s0 sp1 r = case hash of
[] -> fail sp0 "End-of-file inside pragma" (incrSpan sp1 len) []
'#':'-':'}':_ -> token (trim $ s0 ++ opts) (incrSpan sp1 len) hash
_ -> lexRaw (s0 ++ opts ++ "#") (incrSpan sp1 (len + 1)) (drop 1 hash)
where
(opts, hash) = span (/= '#') r
len = length opts
token = suc sp0 . pragmaOptionsTok mbTool
trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace
lexNestedComment :: Lexer Token a
lexNestedComment suc fail sp0 = lnc (0 :: Integer) id sp0
where
lnc d comm sp str = case (d, str) of
(_, []) -> fail sp0 "Unterminated nested comment" sp []
(1, '-':'}':s) -> suc sp0 (nestedCommentTok (comm "-}")) (incrSpan sp 2) s
(_, '{':'-':s) -> cont (d+1) ("{-" ++) (incrSpan sp 2) s
(_, '-':'}':s) -> cont (d-1) ("-}" ++) (incrSpan sp 2) s
(_, c@'\t' :s) -> cont d (c:) (tabSpan sp) s
(_, c@'\n' :s) -> cont d (c:) (nlSpan sp) s
(_, c :s) -> cont d (c:) (nextSpan sp) s
where cont d' comm' = lnc d' (comm . comm')
lexBOL :: Lexer Token a
lexBOL suc fail sp s _ [] = lexToken suc fail sp s False []
lexBOL suc fail sp s _ ctxt@(n:rest)
| col < n = suc sp (tok VRightBrace) sp s True rest
| col == n = suc sp (tok VSemicolon) sp s False ctxt
| otherwise = lexToken suc fail sp s False ctxt
where col = column (span2Pos sp)
lexToken :: Lexer Token a
lexToken suc _ sp [] = suc sp (tok EOF) sp []
lexToken suc fail sp cs@(c:s)
| take 3 cs == "#-}" = suc sp (tok PragmaEnd) (incrSpan sp 3) (drop 3 cs)
| c == '(' = token LeftParen
| c == ')' = token RightParen
| c == ',' = token Comma
| c == ';' = token Semicolon
| c == '[' = token LeftBracket
| c == ']' = token RightBracket
| c == '_' = token Underscore
| c == '`' = token Backquote
| c == '{' = token LeftBrace
| c == '}' = lexRightBrace (suc sp) (nextSpan sp) s
| c == '\'' = lexChar sp suc fail (nextSpan sp) s
| c == '\"' = lexString sp suc fail (nextSpan sp) s
| isAlpha c = lexIdent (suc sp) sp cs
| isSymbolChar c = lexSymbol (suc sp) sp cs
| isDigit c = lexNumber (suc sp) sp cs
| otherwise = fail sp ("Illegal character " ++ show c) sp s
where token t = suc sp (tok t) (nextSpan sp) s
lexRightBrace :: (Token -> P a) -> P a
lexRightBrace cont sp s bol ctxt = cont (tok RightBrace) sp s bol (drop 1 ctxt)
lexIdent :: (Token -> P a) -> P a
lexIdent cont sp s = maybe (lexOptQual cont (token Id) [ident]) (cont . token)
(Map.lookup ident keywordsSpecialIds)
(incrSpan sp $ length ident) rest
where (ident, rest) = span isIdentChar s
token t = idTok t [] ident
lexSymbol :: (Token -> P a) -> P a
lexSymbol cont sp s = cont
(idTok (Map.findWithDefault Sym sym reservedSpecialOps) [] sym)
(incrSpan sp $ length sym) rest
where (sym, rest) = span isSymbolChar s
lexOptQual :: (Token -> P a) -> Token -> [String] -> P a
lexOptQual cont token mIdent sp cs@('.':c:s)
| isAlpha c = lexQualIdent cont identCont mIdent
(nextSpan sp) (c:s)
| isSymbolChar c && c /= '.' = lexQualSymbol cont identCont mIdent
(nextSpan sp) (c:s)
where identCont _ _ = cont token sp cs
lexOptQual cont token mIdent sp cs@('.':'.':c:s)
| isSymbolChar c = lexQualSymbol cont identCont mIdent
(nextSpan sp) ('.':c:s)
where identCont _ _ = cont token sp cs
lexOptQual cont token _ sp cs = cont token sp cs
lexQualIdent :: (Token -> P a) -> P a -> [String] -> P a
lexQualIdent cont identCont mIdent sp s =
maybe (lexOptQual cont (idTok QId mIdent ident) (mIdent ++ [ident]))
(const identCont)
(Map.lookup ident keywords)
(incrSpan sp (length ident)) rest
where (ident, rest) = span isIdentChar s
lexQualSymbol :: (Token -> P a) -> P a -> [String] -> P a
lexQualSymbol cont identCont mIdent sp s =
maybe (cont (idTok QSym mIdent sym)) (const identCont)
(Map.lookup sym reservedOps)
(incrSpan sp (length sym)) rest
where (sym, rest) = span isSymbolChar s
lexNumber :: (Token -> P a) -> P a
lexNumber cont sp ('0':c:s)
| c `elem` "bB" = lexBinary cont nullCont (incrSpan sp 2) s
| c `elem` "oO" = lexOctal cont nullCont (incrSpan sp 2) s
| c `elem` "xX" = lexHexadecimal cont nullCont (incrSpan sp 2) s
where nullCont _ _ = cont (intTok 10 "0") (nextSpan sp) (c:s)
lexNumber cont sp s = lexOptFraction cont (intTok 10 digits) digits
(incrSpan sp $ length digits) rest
where (digits, rest) = span isDigit s
lexBinary :: (Token -> P a) -> P a -> P a
lexBinary cont nullCont sp s
| null digits = nullCont undefined undefined
| otherwise = cont (intTok 2 digits) (incrSpan sp $ length digits) rest
where (digits, rest) = span isBinDigit s
isBinDigit c = c >= '0' && c <= '1'
lexOctal :: (Token -> P a) -> P a -> P a
lexOctal cont nullCont sp s
| null digits = nullCont undefined undefined
| otherwise = cont (intTok 8 digits) (incrSpan sp $ length digits) rest
where (digits, rest) = span isOctDigit s
lexHexadecimal :: (Token -> P a) -> P a -> P a
lexHexadecimal cont nullCont sp s
| null digits = nullCont undefined undefined
| otherwise = cont (intTok 16 digits) (incrSpan sp $ length digits) rest
where (digits, rest) = span isHexDigit s
lexOptFraction :: (Token -> P a) -> Token -> String -> P a
lexOptFraction cont _ mant sp ('.':c:s)
| isDigit c = lexOptExponent cont (floatTok mant frac 0 "") mant frac
(incrSpan sp (length frac+1)) rest
where (frac,rest) = span isDigit (c:s)
lexOptFraction cont token mant sp (c:s)
| c `elem` "eE" = lexSignedExponent cont intCont mant "" [c] (nextSpan sp) s
where intCont _ _ = cont token sp (c:s)
lexOptFraction cont token _ sp s = cont token sp s
lexOptExponent :: (Token -> P a) -> Token -> String -> String -> P a
lexOptExponent cont token mant frac sp (c:s)
| c `elem` "eE" = lexSignedExponent cont floatCont mant frac [c] (nextSpan sp) s
where floatCont _ _ = cont token sp (c:s)
lexOptExponent cont token _ _ sp s = cont token sp s
lexSignedExponent :: (Token -> P a) -> P a -> String -> String -> String
-> P a
lexSignedExponent cont floatCont mant frac e sp str = case str of
('+':c:s) | isDigit c -> lexExpo (e ++ "+") id (nextSpan sp) (c:s)
('-':c:s) | isDigit c -> lexExpo (e ++ "-") negate (nextSpan sp) (c:s)
(c:_) | isDigit c -> lexExpo e id sp str
_ -> floatCont sp str
where lexExpo = lexExponent cont mant frac
lexExponent :: (Token -> P a) -> String -> String -> String -> (Int -> Int)
-> P a
lexExponent cont mant frac e expSign sp s =
cont (floatTok mant frac expo (e ++ digits)) (incrSpan sp $ length digits) rest
where (digits, rest) = span isDigit s
expo = expSign (convertIntegral 10 digits)
lexChar :: Span -> Lexer Token a
lexChar sp0 _ fail sp [] = fail sp0 "Illegal character constant" sp []
lexChar sp0 success fail sp (c:s)
| c == '\\' = lexEscape sp (\d o -> lexCharEnd d o sp0 success fail)
fail (nextSpan sp) s
| c == '\n' = fail sp0 "Illegal character constant" sp (c:s)
| c == '\t' = lexCharEnd c "\t" sp0 success fail (tabSpan sp) s
| otherwise = lexCharEnd c [c] sp0 success fail (nextSpan sp) s
lexCharEnd :: Char -> String -> Span -> Lexer Token a
lexCharEnd c o sp0 suc _ sp ('\'':s) = suc sp0 (charTok c o) (nextSpan sp) s
lexCharEnd _ _ sp0 _ fail sp s =
fail sp0 "Improperly terminated character constant" sp s
lexString :: Span -> Lexer Token a
lexString sp0 suc fail = lexStringRest "" id
where
lexStringRest _ _ sp [] = improperTermination sp
lexStringRest s0 so sp (c:s)
| c == '\n' = improperTermination sp
| c == '\"' = suc sp0 (stringTok (reverse s0) (so "")) (nextSpan sp) s
| c == '\\' = lexStringEscape sp s0 so lexStringRest fail (nextSpan sp) s
| c == '\t' = lexStringRest (c:s0) (so . (c:)) (tabSpan sp) s
| otherwise = lexStringRest (c:s0) (so . (c:)) (nextSpan sp) s
improperTermination sp = fail sp0 "Improperly terminated string constant" sp []
lexStringEscape :: Span -> String -> (String -> String)
-> (String -> (String -> String) -> P a)
-> FailP a -> P a
lexStringEscape sp0 _ _ _ fail sp [] = lexEscape sp0 undefined fail sp []
lexStringEscape sp0 s0 so suc fail sp cs@(c:s)
| c == '&' = suc s0 (so . ("\\&" ++)) (nextSpan sp) s
| isSpace c = lexStringGap so (suc s0) fail sp cs
| otherwise = lexEscape sp0 (\ c' s' -> suc (c': s0) (so . (s' ++))) fail sp cs
lexStringGap :: (String -> String) -> ((String -> String) -> P a)
-> FailP a -> P a
lexStringGap _ _ fail sp [] = fail sp "End-of-file in string gap" sp []
lexStringGap so suc fail sp (c:s)
| c == '\\' = suc (so . (c:)) (nextSpan sp) s
| c == '\t' = lexStringGap (so . (c:)) suc fail (tabSpan sp) s
| c == '\n' = lexStringGap (so . (c:)) suc fail (nlSpan sp) s
| isSpace c = lexStringGap (so . (c:)) suc fail (nextSpan sp) s
| otherwise = fail sp ("Illegal character in string gap: " ++ show c) sp s
lexEscape :: Span -> (Char -> String -> P a) -> FailP a -> P a
lexEscape sp0 suc fail sp str = case str of
('a' :s) -> suc '\a' "\\a" (nextSpan sp) s
('b' :s) -> suc '\b' "\\b" (nextSpan sp) s
('f' :s) -> suc '\f' "\\f" (nextSpan sp) s
('n' :s) -> suc '\n' "\\n" (nextSpan sp) s
('r' :s) -> suc '\r' "\\r" (nextSpan sp) s
('t' :s) -> suc '\t' "\\t" (nextSpan sp) s
('v' :s) -> suc '\v' "\\v" (nextSpan sp) s
('\\':s) -> suc '\\' "\\\\" (nextSpan sp) s
('"' :s) -> suc '\"' "\\\"" (nextSpan sp) s
('\'':s) -> suc '\'' "\\\'" (nextSpan sp) s
('^':c:s) | isControlEsc c -> controlEsc c (incrSpan sp 2) s
('o':c:s) | isOctDigit c -> numEsc 8 isOctDigit ("\\o" ++) (nextSpan sp) (c:s)
('x':c:s) | isHexDigit c -> numEsc 16 isHexDigit ("\\x" ++) (nextSpan sp) (c:s)
(c:s) | isDigit c -> numEsc 10 isDigit ("\\" ++) sp (c:s)
_ -> asciiEscape sp0 suc fail sp str
where numEsc = numEscape sp0 suc fail
controlEsc c = suc (chr (ord c `mod` 32)) ("\\^" ++ [c])
isControlEsc c = isUpper c || c `elem` "@[\\]^_"
numEscape :: Span -> (Char -> String -> P a) -> FailP a -> Int
-> (Char -> Bool) -> (String -> String) -> P a
numEscape sp0 suc fail b isDigit' so sp s
| n >= ord minBound && n <= ord maxBound
= suc (chr n) (so digits) (incrSpan sp $ length digits) rest
| otherwise
= fail sp0 "Numeric escape out-of-range" sp s
where (digits, rest) = span isDigit' s
n = convertIntegral b digits
asciiEscape :: Span -> (Char -> String -> P a) -> FailP a -> P a
asciiEscape sp0 suc fail sp str = case str of
('N':'U':'L':s) -> suc '\NUL' "\\NUL" (incrSpan sp 3) s
('S':'O':'H':s) -> suc '\SOH' "\\SOH" (incrSpan sp 3) s
('S':'T':'X':s) -> suc '\STX' "\\STX" (incrSpan sp 3) s
('E':'T':'X':s) -> suc '\ETX' "\\ETX" (incrSpan sp 3) s
('E':'O':'T':s) -> suc '\EOT' "\\EOT" (incrSpan sp 3) s
('E':'N':'Q':s) -> suc '\ENQ' "\\ENQ" (incrSpan sp 3) s
('A':'C':'K':s) -> suc '\ACK' "\\ACK" (incrSpan sp 3) s
('B':'E':'L':s) -> suc '\BEL' "\\BEL" (incrSpan sp 3) s
('B':'S' :s) -> suc '\BS' "\\BS" (incrSpan sp 2) s
('H':'T' :s) -> suc '\HT' "\\HT" (incrSpan sp 2) s
('L':'F' :s) -> suc '\LF' "\\LF" (incrSpan sp 2) s
('V':'T' :s) -> suc '\VT' "\\VT" (incrSpan sp 2) s
('F':'F' :s) -> suc '\FF' "\\FF" (incrSpan sp 2) s
('C':'R' :s) -> suc '\CR' "\\CR" (incrSpan sp 2) s
('S':'O' :s) -> suc '\SO' "\\SO" (incrSpan sp 2) s
('S':'I' :s) -> suc '\SI' "\\SI" (incrSpan sp 2) s
('D':'L':'E':s) -> suc '\DLE' "\\DLE" (incrSpan sp 3) s
('D':'C':'1':s) -> suc '\DC1' "\\DC1" (incrSpan sp 3) s
('D':'C':'2':s) -> suc '\DC2' "\\DC2" (incrSpan sp 3) s
('D':'C':'3':s) -> suc '\DC3' "\\DC3" (incrSpan sp 3) s
('D':'C':'4':s) -> suc '\DC4' "\\DC4" (incrSpan sp 3) s
('N':'A':'K':s) -> suc '\NAK' "\\NAK" (incrSpan sp 3) s
('S':'Y':'N':s) -> suc '\SYN' "\\SYN" (incrSpan sp 3) s
('E':'T':'B':s) -> suc '\ETB' "\\ETB" (incrSpan sp 3) s
('C':'A':'N':s) -> suc '\CAN' "\\CAN" (incrSpan sp 3) s
('E':'M' :s) -> suc '\EM' "\\EM" (incrSpan sp 2) s
('S':'U':'B':s) -> suc '\SUB' "\\SUB" (incrSpan sp 3) s
('E':'S':'C':s) -> suc '\ESC' "\\ESC" (incrSpan sp 3) s
('F':'S' :s) -> suc '\FS' "\\FS" (incrSpan sp 2) s
('G':'S' :s) -> suc '\GS' "\\GS" (incrSpan sp 2) s
('R':'S' :s) -> suc '\RS' "\\RS" (incrSpan sp 2) s
('U':'S' :s) -> suc '\US' "\\US" (incrSpan sp 2) s
('S':'P' :s) -> suc '\SP' "\\SP" (incrSpan sp 2) s
('D':'E':'L':s) -> suc '\DEL' "\\DEL" (incrSpan sp 3) s
s -> fail sp0 "Illegal escape sequence" sp s