-- File generated by the BNF Converter. -- -*- haskell -*- -- Lexer definition for use with Alex 3. { {-# OPTIONS -fno-warn-incomplete-patterns #-} {-# OPTIONS_GHC -w #-} {-# LANGUAGE PatternSynonyms #-} module LexC where import Prelude import qualified Data.Bits import Data.Char (ord) import Data.Function (on) import Data.Maybe (fromMaybe) import qualified Data.Map as Map import Data.Map (Map) import Data.Word (Word8) } -- Predefined character classes $c = [A-Z\192-\221] # [\215] -- capital isolatin1 letter (215 = \times) $s = [a-z\222-\255] # [\247] -- small isolatin1 letter (247 = \div ) $l = [$c $s] -- letter $d = [0-9] -- digit $i = [$l $d _ '] -- identifier character $u = [. \n] -- universal: any character -- Symbols and non-identifier-like reserved words @rsyms = \! | \! \= | \% | \% \= | \& | \& \& | \& \= | \( | \) | \* | \* \= | \+ | \+ \+ | \+ \= | \, | \- | \- \- | \- \= | \- \> | \. | \. \. \. | \/ | \/ \= | \: | \; | \< | \< \< | \< \< \= | \< \= | \= | \= \= | \> | \> \= | \> \> | \> \> \= | \? | \[ | \] | \^ | \^ \= | \{ | \| | \| \= | \| \| | \} | \~ :- -- Line comment "//" "//" [.]* ; -- Line comment "#" "#" [.]* ; -- Block comment "/*" "*/" \/ \* [ $u # \* ]* \* ([ $u # [ \/ \* ] ] [ $u # \* ]* \* | \*)* \/; -- Whitespace (skipped) $white+ ; -- Symbols @rsyms { tok (eitherResIdent TV) } -- token CDouble ($d+ \. | \. $d+) ([ e E ] \-? $d+)? | $d+ [ e E ] \-? $d+ | $d+ \. $d+ E \-? $d+ { tok (eitherResIdent T_CDouble) } -- token CFloat ($d+ \. $d+ | $d+ \. | \. $d+) ([ e E ] \-? $d+)? [ f F ] | $d+ [ e E ] \-? $d+ [ f F ] { tok (eitherResIdent T_CFloat) } -- token CLongDouble ($d+ \. $d+ | $d+ \. | \. $d+) ([ e E ] \-? $d+)? [ l L ] | $d+ [ e E ] \-? $d+ [ l L ] { tok (eitherResIdent T_CLongDouble) } -- token HexLong 0 [ x X ] [ $d a b c d e f A B C D E F ]+ [ l L ] { tok (eitherResIdent T_HexLong) } -- token HexUnsLong 0 [ x X ] [ $d a b c d e f A B C D E F ]+ (u l | U L) { tok (eitherResIdent T_HexUnsLong) } -- token HexUnsigned 0 [ x X ] [ $d a b c d e f A B C D E F ]+ [ u U ] { tok (eitherResIdent T_HexUnsigned) } -- token Hexadecimal 0 [ x X ] [ $d a b c d e f A B C D E F ]+ { tok (eitherResIdent T_Hexadecimal) } -- token Long [ 1 2 3 4 5 6 7 8 9 ] $d* [ l L ] { tok (eitherResIdent T_Long) } -- token Octal 0 [ 0 1 2 3 4 5 6 7 ]* { tok (eitherResIdent T_Octal) } -- token OctalLong 0 [ 0 1 2 3 4 5 6 7 ]* [ l L ] { tok (eitherResIdent T_OctalLong) } -- token OctalUnsLong 0 [ 0 1 2 3 4 5 6 7 ]* (u l | U L) { tok (eitherResIdent T_OctalUnsLong) } -- token OctalUnsigned 0 [ 0 1 2 3 4 5 6 7 ]* [ u U ] { tok (eitherResIdent T_OctalUnsigned) } -- token Unsigned [ 1 2 3 4 5 6 7 8 9 ] $d* [ u U ] { tok (eitherResIdent T_Unsigned) } -- token UnsignedLong [ 1 2 3 4 5 6 7 8 9 ] $d* (u l | U L) { tok (eitherResIdent T_UnsignedLong) } -- Keywords and Ident $l $i* { tok (eitherResIdent TV) } -- String \" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t | r | f)))* \" { tok (TL . unescapeInitTail) } -- Char \' ($u # [\' \\] | \\ [\\ \' n t r f]) \' { tok TC } -- Integer $d+ { tok TI } -- Double $d+ \. $d+ (e (\-)? $d+)? { tok TD } { -- | Create a token with position. tok :: (String -> Tok) -> (Posn -> String -> Token) tok f p = PT p . f -- | Token without position. data Tok = TK {-# UNPACK #-} !TokSymbol -- ^ Reserved word or symbol. | TL !String -- ^ String literal. | TI !String -- ^ Integer literal. | TV !String -- ^ Identifier. | TD !String -- ^ Float literal. | TC !String -- ^ Character literal. | T_Unsigned !String | T_Long !String | T_UnsignedLong !String | T_Hexadecimal !String | T_HexUnsigned !String | T_HexLong !String | T_HexUnsLong !String | T_Octal !String | T_OctalUnsigned !String | T_OctalLong !String | T_OctalUnsLong !String | T_CDouble !String | T_CFloat !String | T_CLongDouble !String deriving (Eq, Show, Ord) -- | Smart constructor for 'Tok' for the sake of backwards compatibility. pattern TS :: String -> Int -> Tok pattern TS t i = TK (TokSymbol t i) -- | Keyword or symbol tokens have a unique ID. data TokSymbol = TokSymbol { tsText :: String -- ^ Keyword or symbol text. , tsID :: !Int -- ^ Unique ID. } deriving (Show) -- | Keyword/symbol equality is determined by the unique ID. instance Eq TokSymbol where (==) = (==) `on` tsID -- | Keyword/symbol ordering is determined by the unique ID. instance Ord TokSymbol where compare = compare `on` tsID -- | Token with position. data Token = PT Posn Tok | Err Posn deriving (Eq, Show, Ord) -- | Pretty print a position. printPosn :: Posn -> String printPosn (Pn _ l c) = "line " ++ show l ++ ", column " ++ show c -- | Pretty print the position of the first token in the list. tokenPos :: [Token] -> String tokenPos (t:_) = printPosn (tokenPosn t) tokenPos [] = "end of file" -- | Get the position of a token. tokenPosn :: Token -> Posn tokenPosn (PT p _) = p tokenPosn (Err p) = p -- | Get line and column of a token. tokenLineCol :: Token -> (Int, Int) tokenLineCol = posLineCol . tokenPosn -- | Get line and column of a position. posLineCol :: Posn -> (Int, Int) posLineCol (Pn _ l c) = (l,c) -- | Convert a token into "position token" form. mkPosToken :: Token -> ((Int, Int), String) mkPosToken t = (tokenLineCol t, tokenText t) -- | Convert a token to its text. tokenText :: Token -> String tokenText t = case t of PT _ (TS s _) -> s PT _ (TL s) -> show s PT _ (TI s) -> s PT _ (TV s) -> s PT _ (TD s) -> s PT _ (TC s) -> s Err _ -> "#error" PT _ (T_Unsigned s) -> s PT _ (T_Long s) -> s PT _ (T_UnsignedLong s) -> s PT _ (T_Hexadecimal s) -> s PT _ (T_HexUnsigned s) -> s PT _ (T_HexLong s) -> s PT _ (T_HexUnsLong s) -> s PT _ (T_Octal s) -> s PT _ (T_OctalUnsigned s) -> s PT _ (T_OctalLong s) -> s PT _ (T_OctalUnsLong s) -> s PT _ (T_CDouble s) -> s PT _ (T_CFloat s) -> s PT _ (T_CLongDouble s) -> s -- | Convert a token to a string. prToken :: Token -> String prToken t = tokenText t -- | Convert potential keyword into token or use fallback conversion. eitherResIdent :: (String -> Tok) -> String -> Tok eitherResIdent tv s = fromMaybe (tv s) (Map.lookup s resWords) -- | The keywords and symbols of the language organized as a Map. resWords :: Map String Tok resWords = Map.fromAscList [ ("!", TS "!" 1) , ("!=", TS "!=" 2) , ("%", TS "%" 3) , ("%=", TS "%=" 4) , ("&", TS "&" 5) , ("&&", TS "&&" 6) , ("&=", TS "&=" 7) , ("(", TS "(" 8) , (")", TS ")" 9) , ("*", TS "*" 10) , ("*=", TS "*=" 11) , ("+", TS "+" 12) , ("++", TS "++" 13) , ("+=", TS "+=" 14) , (",", TS "," 15) , ("-", TS "-" 16) , ("--", TS "--" 17) , ("-=", TS "-=" 18) , ("->", TS "->" 19) , (".", TS "." 20) , ("...", TS "..." 21) , ("/", TS "/" 22) , ("/=", TS "/=" 23) , (":", TS ":" 24) , (";", TS ";" 25) , ("<", TS "<" 26) , ("<<", TS "<<" 27) , ("<<=", TS "<<=" 28) , ("<=", TS "<=" 29) , ("=", TS "=" 30) , ("==", TS "==" 31) , (">", TS ">" 32) , (">=", TS ">=" 33) , (">>", TS ">>" 34) , (">>=", TS ">>=" 35) , ("?", TS "?" 36) , ("Typedef_name", TS "Typedef_name" 37) , ("[", TS "[" 38) , ("]", TS "]" 39) , ("^", TS "^" 40) , ("^=", TS "^=" 41) , ("auto", TS "auto" 42) , ("break", TS "break" 43) , ("case", TS "case" 44) , ("char", TS "char" 45) , ("const", TS "const" 46) , ("continue", TS "continue" 47) , ("default", TS "default" 48) , ("do", TS "do" 49) , ("double", TS "double" 50) , ("else", TS "else" 51) , ("enum", TS "enum" 52) , ("extern", TS "extern" 53) , ("float", TS "float" 54) , ("for", TS "for" 55) , ("goto", TS "goto" 56) , ("if", TS "if" 57) , ("int", TS "int" 58) , ("long", TS "long" 59) , ("register", TS "register" 60) , ("return", TS "return" 61) , ("short", TS "short" 62) , ("signed", TS "signed" 63) , ("sizeof", TS "sizeof" 64) , ("static", TS "static" 65) , ("struct", TS "struct" 66) , ("switch", TS "switch" 67) , ("typedef", TS "typedef" 68) , ("union", TS "union" 69) , ("unsigned", TS "unsigned" 70) , ("void", TS "void" 71) , ("volatile", TS "volatile" 72) , ("while", TS "while" 73) , ("{", TS "{" 74) , ("|", TS "|" 75) , ("|=", TS "|=" 76) , ("||", TS "||" 77) , ("}", TS "}" 78) , ("~", TS "~" 79) ] -- | Unquote string literal. unescapeInitTail :: String -> String unescapeInitTail = id . unesc . tail . id where unesc s = case s of '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs '\\':'n':cs -> '\n' : unesc cs '\\':'t':cs -> '\t' : unesc cs '\\':'r':cs -> '\r' : unesc cs '\\':'f':cs -> '\f' : unesc cs '"':[] -> [] c:cs -> c : unesc cs _ -> [] ------------------------------------------------------------------- -- Alex wrapper code. -- A modified "posn" wrapper. ------------------------------------------------------------------- data Posn = Pn !Int !Int !Int deriving (Eq, Show, Ord) alexStartPos :: Posn alexStartPos = Pn 0 1 1 alexMove :: Posn -> Char -> Posn alexMove (Pn a l c) '\t' = Pn (a+1) l (((c+7) `div` 8)*8+1) alexMove (Pn a l c) '\n' = Pn (a+1) (l+1) 1 alexMove (Pn a l c) _ = Pn (a+1) l (c+1) type Byte = Word8 type AlexInput = ( Posn -- current position , Char -- previous char , [Byte] -- pending bytes on the current char , String ) -- current input string tokens :: String -> [Token] tokens str = go (alexStartPos, '\n', [], str) where go :: AlexInput -> [Token] go inp@(pos, _, _, str) = case alexScan inp 0 of AlexEOF -> [] AlexError (pos, _, _, _) -> [Err pos] AlexSkip inp' len -> go inp' AlexToken inp' len act -> act pos ( take len str) : (go inp') alexGetByte :: AlexInput -> Maybe (Byte,AlexInput) alexGetByte (p, c, (b:bs), s) = Just (b, (p, c, bs, s)) alexGetByte (p, _, [], s) = case s of [] -> Nothing (c:s) -> let p' = alexMove p c (b:bs) = utf8Encode c in p' `seq` Just (b, (p', c, bs, s)) alexInputPrevChar :: AlexInput -> Char alexInputPrevChar (p, c, bs, s) = c -- | Encode a Haskell String to a list of Word8 values, in UTF8 format. utf8Encode :: Char -> [Word8] utf8Encode = map fromIntegral . go . ord where go oc | oc <= 0x7f = [oc] | oc <= 0x7ff = [ 0xc0 + (oc `Data.Bits.shiftR` 6) , 0x80 + oc Data.Bits..&. 0x3f ] | oc <= 0xffff = [ 0xe0 + (oc `Data.Bits.shiftR` 12) , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f) , 0x80 + oc Data.Bits..&. 0x3f ] | otherwise = [ 0xf0 + (oc `Data.Bits.shiftR` 18) , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f) , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f) , 0x80 + oc Data.Bits..&. 0x3f ] }