{-# LANGUAGE OverloadedStrings #-} module BNFC.Backend.Haskell.Lexer where import BNFC.Prelude import Control.Monad.State import Data.List (intersperse) import qualified Data.Map as Map import Data.String (fromString) import Prettyprinter import System.FilePath (takeBaseName) import BNFC.Backend.Common.StringUtils (escapeChars) import BNFC.Backend.Common.Utils as Utils import BNFC.Backend.CommonInterface.Backend import BNFC.Backend.Haskell.Layout import BNFC.Backend.Haskell.Options import BNFC.Backend.Haskell.State import BNFC.Backend.Haskell.Utilities.Lexer import BNFC.Backend.Haskell.Utilities.Utils import BNFC.CF import BNFC.Lexing import BNFC.Options.GlobalOptions import BNFC.Types.Position import BNFC.Types.Regex import qualified BNFC.Utils.List2 as List2 haskellLexer :: LBNF -> State HaskellBackendState Result haskellLexer lbnf = do st <- get layout <- haskellLayout lbnf let cfName = takeBaseName $ optInput $ globalOpt st inDirectory = inDir $ haskellOpts st nSpace = nameSpace $ haskellOpts st tt = tokenText $ haskellOpts st toks = lexerParserTokens st lexerSpecification = cf2lexer lbnf cfName inDirectory nSpace tt toks return $ if layoutsAreUsed lbnf then (mkFilePath inDirectory nSpace cfName "Lex" "x", lexerSpecification) : layout else [(mkFilePath inDirectory nSpace cfName "Lex" "x", lexerSpecification)] cf2lexer :: LBNF -> String -> Bool -> Maybe String -> TokenText -> [Token] -> String cf2lexer lbnf name inDir nameSpace tokenText toks = docToString defaultLayoutOptions $ cf2doc lbnf name inDir nameSpace tokenText toks cf2doc :: LBNF -> String -> Bool -> Maybe String -> TokenText -> [Token] -> Doc () cf2doc lbnf cfName inDir nameSpace tokenText toks = (vsep . intersperse emptyDoc) [ prelude cfName inDir nameSpace tokenText , cMacros , rMacros lbnf , restOfAlex tokenText toks lbnf ] prelude :: String -> Bool -> Maybe String -> TokenText -> Doc () prelude cfName inDir nameSpace tokenText = vsep $ [ "-- File generated by the BNF Converter." , emptyDoc , "-- -*- haskell -*-" , "-- Lexer definition for use with Alex 3." , lbrace , "{-# OPTIONS -fno-warn-incomplete-patterns #-}" , "{-# OPTIONS_GHC -w #-}" , emptyDoc , "{-# LANGUAGE PatternSynonyms #-}" , emptyDoc , "module" <+> fromString (mkModule inDir nameSpace cfName "Lex") <+> "where" , emptyDoc , "import Prelude" , emptyDoc ] ++ Utils.when (tokenText /= StringToken) [ tokenTextImport tokenText, emptyDoc ] ++ [ "import qualified Data.Bits" , "import Data.Char (ord)" , "import Data.Function (on)" , "import Data.Maybe (fromMaybe)" , "import qualified Data.Map as Map" , "import Data.Map (Map)" , "import Data.Word (Word8)" , rbrace ] -- | Character class definitions. cMacros :: Doc () cMacros = vsep [ "-- Predefined character classes" , emptyDoc , "$c = [A-Z\\192-\\221] # [\\215] -- capital isolatin1 letter (215 = \\times)" , "$s = [a-z\\222-\\255] # [\\247] -- small isolatin1 letter (247 = \\div )" , "$l = [$c $s] -- letter" , "$d = [0-9] -- digit" , "$i = [$l $d _ '] -- identifier character" , "$u = [. \\n] -- universal: any character" ] -- | Regular expressions and lex actions. rMacros :: LBNF -> Doc () rMacros lbnf = vsep $ Utils.unless (Map.null (_lbnfSymbols lbnf)) [ "-- Symbols and non-identifier-like reserved words" , emptyDoc , "@rsyms =" <+> (hsep . intersperse pipe) (fromString . unwords . esc <$> symbs) ] where symbs :: [String] symbs = unicodeAndSymbols lbnf esc :: String -> [String] esc s = if null a then rest else show a : rest where (a, r) = span (\ c -> isAscii c && isAlphaNum c) s rest = case r of [] -> [] c : xs -> (if isPrint c then ['\\',c] else '\\' : show (ord c)) : esc xs -- rest af Alex. restOfAlex :: TokenText -> [Token] -> LBNF -> Doc () restOfAlex tokenText toks lbnf = vsep [ ":-" , vsep $ concat [ [ emptyDoc ] -- Line comments. , Utils.when (not (null (_lbnfLineComments lbnf))) [ vsep $ lineComment . snd <$> Map.toList (_lbnfLineComments lbnf) , emptyDoc ] -- Block comments. , Utils.when (not (null (_lbnfBlockComments lbnf))) [ vsep $ blockComment . snd <$> Map.toList (_lbnfBlockComments lbnf) , emptyDoc ] ] , "-- Whitespace (skipped)" , "$white+ ;" , vsep $ concat [ [ emptyDoc ] , Utils.unless (null (unicodeAndSymbols lbnf)) [ "-- Symbols" , "@rsyms" , indent 4 "{ tok (eitherResIdent TV) }" , emptyDoc ] , Utils.when (not (null lbnfTokens)) [ userDefTokenTypes , emptyDoc ] , [ "-- Keywords and Ident" , "$l $i*" , indent 4 "{ tok (eitherResIdent TV) }" , emptyDoc ] , Utils.when (BString `elem` usedBuiltins) [ "-- String" , "\\\" ([$u # [\\\" \\\\ \\n]] | (\\\\ (\\\" | \\\\ | \\' | n | t | r | f)))* \\\"" , indent 4 "{ tok (TL . unescapeInitTail) }" , emptyDoc ] , Utils.when (BChar `elem` usedBuiltins) [ "-- Char" , "\\\' ($u # [\\\' \\\\] | \\\\ [\\\\ \\\' n t r f]) \\'" , indent 4 "{ tok TC }" , emptyDoc ] , Utils.when (BInteger `elem` usedBuiltins) [ "-- Integer" , "$d+" , indent 4 "{ tok TI }" , emptyDoc ] , Utils.when (BDouble `elem` usedBuiltins) [ "-- Double" , "$d+ \\. $d+ (e (\\-)? $d+)?" , indent 4 "{ tok TD }" , emptyDoc ] ] , lbrace , "-- | Create a token with position." , "tok :: (" <> stringType <> " -> Tok) -> (Posn -> " <> stringType <> " -> Token)" , "tok f p = PT p . f" , emptyDoc , "-- | Token without position." , tokDataTypes tokenText toks , emptyDoc , "-- | Smart constructor for 'Tok' for the sake of backwards compatibility." , "pattern TS :: " <> stringType <> " -> Int -> Tok" , "pattern TS t i = TK (TokSymbol t i)" , emptyDoc , "-- | Keyword or symbol tokens have a unique ID." , "data TokSymbol = TokSymbol" , indent 2 $ vsep [ "{ tsText :: " <> stringType , indent 4 "-- ^ Keyword or symbol text." , ", tsID :: !Int" , indent 4 "-- ^ Unique ID." , "} deriving (Show)" ] , emptyDoc , "-- | Keyword/symbol equality is determined by the unique ID." , "instance Eq TokSymbol where (==) = (==) `on` tsID" , emptyDoc , "-- | Keyword/symbol ordering is determined by the unique ID." , "instance Ord TokSymbol where compare = compare `on` tsID" , emptyDoc , "-- | Token with position." , "data Token" , " = PT Posn Tok" , " | Err Posn" , " deriving (Eq, Show, Ord)" , emptyDoc , "-- | Pretty print a position." , "printPosn :: Posn -> String" , "printPosn (Pn _ l c) = \"line \" ++ show l ++ \", column \" ++ show c" , emptyDoc , "-- | Pretty print the position of the first token in the list." , "tokenPos :: [Token] -> String" , "tokenPos (t:_) = printPosn (tokenPosn t)" , "tokenPos [] = \"end of file\"" , emptyDoc , "-- | Get the position of a token." , "tokenPosn :: Token -> Posn" , "tokenPosn (PT p _) = p" , "tokenPosn (Err p) = p" , emptyDoc , "-- | Get line and column of a token." , "tokenLineCol :: Token -> (Int, Int)" , "tokenLineCol = posLineCol . tokenPosn" , emptyDoc , "-- | Get line and column of a position." , "posLineCol :: Posn -> (Int, Int)" , "posLineCol (Pn _ l c) = (l,c)" , emptyDoc , "-- | Convert a token into \"position token\" form." , "mkPosToken :: Token -> ((Int, Int), " <> stringType <> ")" , "mkPosToken t = (tokenLineCol t, tokenText t)" , emptyDoc , tokenTextfunction , emptyDoc , "-- | Convert a token to a string." , "prToken :: Token -> String" , "prToken t =" <+> applyP stringUnpack "tokenText t" , emptyDoc , "-- | Convert potential keyword into token or use fallback conversion." , "eitherResIdent :: (" <> stringType <+> "-> Tok) ->" <+> stringType <+> "-> Tok" , "eitherResIdent tv s = fromMaybe (tv s) (Map.lookup s resWords)" , emptyDoc , "-- | The keywords and symbols of the language organized as a Map." , if isStringToken tokenText then "resWords :: Map String Tok" else "resWords :: Map Data.Text.Text Tok" , "resWords = Map.fromAscList" , indent 2 $ vsep $ zipWith (<+>) ("[" : repeat ",") tokenTuples , indent 2 rbracket , emptyDoc , "-- | Unquote string literal." , "unescapeInitTail ::" <+> stringType <+> "->" <+> stringType , "unescapeInitTail =" <+> fromString stringPack <+> ". unesc . tail . " <> fromString stringUnpack , " where" , " unesc s = case s of" , " '\\\\':c:cs | elem c ['\\\"', '\\\\', '\\\''] -> c : unesc cs" , " '\\\\':'n':cs -> '\\n' : unesc cs" , " '\\\\':'t':cs -> '\\t' : unesc cs" , " '\\\\':'r':cs -> '\\r' : unesc cs" , " '\\\\':'f':cs -> '\\f' : unesc cs" , " '\"':[] -> []" , " c:cs -> c : unesc cs" , " _ -> []" , emptyDoc , "-------------------------------------------------------------------" , "-- Alex wrapper code." , "-- A modified \"posn\" wrapper." , "-------------------------------------------------------------------" , emptyDoc , "data Posn = Pn !Int !Int !Int" , " deriving (Eq, Show, Ord)" , emptyDoc , "alexStartPos :: Posn" , "alexStartPos = Pn 0 1 1" , emptyDoc , "alexMove :: Posn -> Char -> Posn" , "alexMove (Pn a l c) '\\t' = Pn (a+1) l (((c+7) `div` 8)*8+1)" , "alexMove (Pn a l c) '\\n' = Pn (a+1) (l+1) 1" , "alexMove (Pn a l c) _ = Pn (a+1) l (c+1)" , emptyDoc , "type Byte = Word8" , emptyDoc , "type AlexInput =" , indent 2 $ vsep [ "( Posn -- current position" , ", Char -- previous char" , ", [Byte] -- pending bytes on the current char" , comma <+> stringType <+> ") -- current input string" ] , emptyDoc , "tokens ::" <+> stringType <+> "-> [Token]" , "tokens str = go (alexStartPos, '\\n', [], str)" , indent 4 $ vsep [ "where" , indent 2 $ vsep [ "go :: AlexInput -> [Token]" , "go inp@(pos, _, _, str) =" , indent 2 $ vsep [ "case alexScan inp 0 of" , indent 2 $ vsep [ "AlexEOF -> []" , "AlexError (pos, _, _, _) -> [Err pos]" , "AlexSkip inp' len -> go inp'" , "AlexToken inp' len act -> act pos (" <+> stringTake <+> "len str) : (go inp')" ] ] ] ] , emptyDoc , "alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)" , "alexGetByte (p, c, (b:bs), s) = Just (b, (p, c, bs, s))" , "alexGetByte (p, _, [], s) =" , indent 2 $ vsep [ "case" <+> apply stringUncons "s" <+> " of" , indent 2 $ vsep [ stringNilP <+> "-> Nothing" , stringConsP <+> "->" , indent 2 $ vsep [ "let p' = alexMove p c" , indent 4 "(b:bs) = utf8Encode c" , "in p' `seq` Just (b, (p', c, bs, s))" ] ] ] , emptyDoc , "alexInputPrevChar :: AlexInput -> Char" , "alexInputPrevChar (p, c, bs, s) = c" , emptyDoc , "-- | Encode a Haskell String to a list of Word8 values, in UTF8 format." , "utf8Encode :: Char -> [Word8]" , "utf8Encode = map fromIntegral . go . ord" , indent 2 "where" , indent 2 "go oc" , indent 3 "| oc <= 0x7f = [oc]" , emptyDoc , indent 3 "| oc <= 0x7ff = [ 0xc0 + (oc `Data.Bits.shiftR` 6)" , indent 24 ", 0x80 + oc Data.Bits..&. 0x3f" , indent 24 "]" , emptyDoc , indent 3 "| oc <= 0xffff = [ 0xe0 + (oc `Data.Bits.shiftR` 12)" , indent 24 ", 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)" , indent 24 ", 0x80 + oc Data.Bits..&. 0x3f" , indent 24 "]" , indent 3 "| otherwise = [ 0xf0 + (oc `Data.Bits.shiftR` 18)" , indent 24 ", 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f)" , indent 24 ", 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)" , indent 24 ", 0x80 + oc Data.Bits..&. 0x3f" , indent 24 "]" , rbrace ] where symbolsKeywords :: [(Doc (), Int)] symbolsKeywords = fmap (\(s,i) -> (fromString (escapeChars $ toList s), i)) (Map.toList $ _lbnfSymbolsKeywords lbnf) tokenTuples :: [Doc ()] tokenTuples = map toTokTuple symbolsKeywords -- tuple containing resersed word and corresponding token. toTokTuple :: (Doc (), Int) -> Doc () toTokTuple (n, i) = if isStringToken tokenText then tupled [dquotes n, "TS" <+> dquotes n <+> fromString (show i)] else tupled [ "Data.Text.pack" <+> dquotes n , "TS" <+> parens ( "Data.Text.pack" <+> dquotes n) <+> fromString (show i) ] lineComment :: LineComment -> Doc () lineComment (LineComment s) = vsep [ "-- Line comment" <+> s' , s' <+> "[.]*" <+> semi ] where s' :: Doc () s' = dquotes (fromString (toList s)) blockComment :: BlockComment -> Doc () blockComment (BlockComment s1 s2) = vsep [ "-- Block comment" <+> dquotes (fromString (toList s1)) <+> dquotes (fromString (toList s2)) , printRegAlex (mkRegMultilineComment (toList s1) (toList s2)) <> semi ] lbnfTokens :: TokenDefs lbnfTokens = if hasIdentifier $ _lbnfTokenDefs lbnf then Map.delete ('I' :| "dent") (_lbnfTokenDefs lbnf) else _lbnfTokenDefs lbnf userDefTokenTypes :: Doc () userDefTokenTypes = (vsep . intersperse emptyDoc) $ userDefTokenType . (\(a,b) -> (a, (regexToken . wpThing) b)) <$> Map.toList lbnfTokens userDefTokenType :: (CatName, Regex) -> Doc () userDefTokenType (name, regex) = vsep [ "-- token" <+> fromString (toList name) , printRegAlex regex , indent 4 $ braces $ space <> "tok" <+> lparen <> "eitherResIdent T_" <> fromString (toList name) <> rparen <> space ] usedBuiltins:: [BuiltinCat] usedBuiltins = Map.keys $ _lbnfParserBuiltins lbnf tokDataTypes :: TokenText -> [Token] -> Doc () tokDataTypes tt tokens = if isStringToken tt then vsep $ [ "data Tok" , indent 2 "= TK {-# UNPACK #-} !TokSymbol -- ^ Reserved word or symbol." , indent 2 "| TL !String -- ^ String literal." , indent 2 "| TI !String -- ^ Integer literal." , indent 2 "| TV !String -- ^ Identifier." , indent 2 "| TD !String -- ^ Float literal." , indent 2 "| TC !String -- ^ Character literal." ] ++ (indent 2 . tokDataType <$> filter isUserDefined tokens) ++ [space <> space <> "deriving (Eq, Show, Ord)"] else vsep $ [ "data Tok" , indent 2 "= TK {-# UNPACK #-} !TokSymbol -- ^ Reserved word or symbol." , indent 2 "| TL !Data.Text.Text -- ^ String literal." , indent 2 "| TI !Data.Text.Text -- ^ Integer literal." , indent 2 "| TV !Data.Text.Text -- ^ Identifier." , indent 2 "| TD !Data.Text.Text -- ^ Float literal." , indent 2 "| TC !Data.Text.Text -- ^ Character literal." ] ++ (indent 2 . tokDataType <$> filter isUserDefined tokens) ++ [space <> space <> "deriving (Eq, Show, Ord)"] tokDataType :: Token -> Doc () tokDataType token = vsep [ "|" <+> tokenName token <+> "!" <> stringType <> tokenComment token ] tokenTextfunction :: Doc () tokenTextfunction = vsep $ [ "-- | Convert a token to its text." , "tokenText :: Token -> " <> stringType , "tokenText t = case t of" , indent 2 $ vsep [ "PT _ (TS s _) -> s" , "PT _ (TL s) -> " <> applyP stringPack "show s" , "PT _ (TI s) -> s" , "PT _ (TV s) -> s" , "PT _ (TD s) -> s" , "PT _ (TC s) -> s" , "Err _ -> " <> apply stringPack "\"#error\"" ] ] ++ [ indent 2 "PT _ (" <> tokenName token <+> "s) -> s" | token <- filter isUserDefined toks ] (stringType, stringTake, stringUncons, stringPack, stringUnpack, stringNilP, stringConsP) = case tokenText of StringToken -> ("String", "take", "", "id", "id", "[]", "(c:s)" ) TextToken -> ("Data.Text.Text", "Data.Text.take", "Data.Text.uncons", "Data.Text.pack", "Data.Text.unpack", "Nothing", "Just (c,s)") apply :: String -> String -> Doc () apply "" s = fromString s apply "id" s = fromString s apply f s = fromString f <+> fromString s applyP :: String -> String -> Doc () applyP "" s = fromString s applyP "id" s = fromString s applyP f s = fromString f <+> lparen <> fromString s <> rparen ----------------------------------------------------------- -- Pretty printer for Regex. -- The top-level printing method. printRegAlex :: Regex -> Doc () printRegAlex = prt 0 -- the printer class does the job class Print a where prt :: Int -> a -> Doc () instance {-# OVERLAPPABLE #-} Print a => Print [a] where prt i as = hsep $ map (prt i) as instance Print Char where prt _ = \case '\n' -> "\\n" '\t' -> "\\t" '\r' -> "\\r" '\f' -> "\\f" c | isAlphaNum c -> fromString [c] c | isPrint c -> fromString $ '\\':[c] c -> fromString $ '\\':show (ord c) instance Print Regex where prt i e = case e of RChar (CMinus yes no) -> if isEmpty no then if onlyOneChar yes then prt 2 yes else lbracket <+> prt 2 yes <+> rbracket else lbracket <+> prt 2 yes <+> fromString "#" <+> prt 2 no <+> rbracket RAlts regs -> prPrec i 1 $ hsep $ intersperse "|" $ map (prt 1) $ List2.toList regs RMinus reg1 reg2 -> lbracket <+> prt 2 reg1 <+> fromString "#" <+> prt 2 reg2 <+> rbracket REps -> lparen <> rparen RSeqs regs -> prPrec i 2 $ prt 2 $ List2.toList regs RStar reg -> prPrec i 3 (prt 3 reg) <> fromString "*" RPlus reg -> prPrec i 3 (prt 3 reg) <> fromString "+" ROpt reg -> prPrec i 3 (prt 3 reg) <> fromString "?" instance Print CharClassUnion where prt i e = case e of CAny -> "$u" CAlt alts -> hsep $ map (prt i) alts instance Print CharClassAtom where prt _ e = case e of CChar c -> prt 0 c CDigit -> fromString "$d" CLower -> fromString "$s" CUpper -> fromString "$c"