{-# LANGUAGE LambdaCase #-} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {- BNF Converter: C flex generator Copyright (C) 2004 Author: Michael Pellauer Copyright (C) 2020 Andreas Abel Description : This module generates the Flex file. It is similar to JLex but with a few peculiarities. Author : Michael Pellauer Created : 5 August, 2003 -} module BNFC.Backend.C.CFtoFlexC ( cf2flex , ParserMode(..), parserName, parserPackage, cParser, stlParser, parserHExt , preludeForBuffer -- C code defining a buffer for lexing string literals. , cMacros -- Lexer definitions. , commentStates -- Stream of names for lexer states for comments. , lexComments -- Lexing rules for comments. , lexStrings -- Lexing rules for string literals. , lexChars -- Lexing rules for character literals. ) where import Prelude hiding ( (<>) ) import Data.Bifunctor ( first ) import Data.Char ( isAlphaNum, isAscii ) import Data.List ( isInfixOf ) import Data.Maybe ( fromMaybe, maybeToList ) import qualified Data.Map as Map import System.FilePath ( (<.>) ) import BNFC.CF import BNFC.Backend.C.Common ( posixC ) import BNFC.Backend.C.RegToFlex import BNFC.Backend.Common.NamedVariables import BNFC.Options ( InPackage ) import BNFC.PrettyPrint import BNFC.Utils ( cstring, symbolToName, unless, when ) data ParserMode = CParser Bool String -- ^ @C@ (@False@) or @C++ no STL@ (@True@) mode, with @name@ to use as prefix. | CppParser InPackage String -- ^ @C++@ mode, with optional package name parserName :: ParserMode -> String parserName = \case CParser _ n -> n CppParser p n -> fromMaybe n p parserPackage :: ParserMode -> InPackage parserPackage = \case CParser _ _ -> Nothing CppParser p _ -> p cParser :: ParserMode -> Bool cParser = \case CParser b _ -> not b CppParser _ _ -> False stlParser :: ParserMode -> Bool stlParser = \case CParser _ _ -> False CppParser _ _ -> True parserHExt :: ParserMode -> String parserHExt = \case CParser b _ -> if b then "H" else "h" CppParser _ _ -> "H" -- | Entrypoint. cf2flex :: ParserMode -> CF -> (String, SymMap) -- The environment is reused by the parser. cf2flex mode cf = (, env) $ unlines [ prelude stringLiterals mode , cMacros cf , lexSymbols env1 , restOfFlex (parserPackage mode) cf env , footer -- mode ] where env = Map.fromList env2 env0 = makeSymEnv (cfgSymbols cf) [0 :: Int ..] env1 = env0 ++ makeKwEnv (reservedWords cf) [length env0 ..] env2 = map (first Keyword) env1 ++ map (\ x -> (Tokentype x, "T_" ++ x)) (tokenNames cf) makeSymEnv = zipWith $ \ s n -> (s, '_' : fromMaybe ("SYMB_" ++ show n) (symbolToName s)) makeKwEnv = zipWith $ \ s n -> (s, "_KW_" ++ if all (\ c -> isAlphaNum c && isAscii c) s then s else show n) stringLiterals = isUsedCat cf (TokenCat catString) prelude :: Bool -> ParserMode -> String prelude stringLiterals mode = unlines $ concat [ [ "/* Lexer definition for use with FLex */" , "" -- noinput and nounput are most often unused -- https://stackoverflow.com/questions/39075510/option-noinput-nounput-what-are-they-for , "%option noyywrap noinput nounput" , "%option reentrant bison-bridge bison-locations" , "" ] , when stringLiterals [ "/* Additional data for the lexer: a buffer for lexing string literals. */" , "%option extra-type=\"Buffer\"" , "" ] , maybeToList $ ("%option prefix=\"" ++) . (++ "\"" ) <$> parserPackage mode , when (cParser mode) $ concat -- The following #define needs to be at the top before the automatic #include [ [ "%top{" ] , posixC , [ "}" ] ] , [ "%{" , "#include \"" ++ ("Absyn" <.> h) ++ "\"" , "#include \"" ++ ("Bison" <.> h) ++ "\"" , "" ] , [ "#define initialize_lexer " ++ parserName mode ++ "_initialize_lexer" , "" ] , when stringLiterals $ preludeForBuffer $ "Buffer" <.> h -- https://www.gnu.org/software/bison/manual/html_node/Token-Locations.html -- Flex is responsible for keeping tracking of the yylloc for Bison. -- Flex also doesn't do this automatically so we need this function -- https://stackoverflow.com/a/22125500/425756 , [ "static void update_loc(YYLTYPE* loc, char* text)" , "{" , " loc->first_line = loc->last_line;" , " loc->first_column = loc->last_column;" , " int i = 0;" -- put this here as @for (int i...)@ is only allowed in C99 , " for (; text[i] != '\\0'; ++i) {" , " if (text[i] == '\\n') {" -- Checking for \n is good enough to also support \r\n (but not \r) , " ++loc->last_line;" , " loc->last_column = 0; " , " } else {" , " ++loc->last_column; " , " }" , " }" , "}" , "#define YY_USER_ACTION update_loc(yylloc, yytext);" , "" , "%}" ] ] where h = parserHExt mode -- | Part of the lexer prelude needed when string literals are to be lexed. -- Defines an interface to the Buffer. preludeForBuffer :: String -> [String] preludeForBuffer bufferH = [ "/* BEGIN extensible string buffer */" , "" , "#include \"" ++ bufferH ++ "\"" , "" , "/* The initial size of the buffer to lex string literals. */" , "#define LITERAL_BUFFER_INITIAL_SIZE 1024" , "" , "/* The pointer to the literal buffer. */" , "#define literal_buffer yyextra" , "" , "/* Initialize the literal buffer. */" , "#define LITERAL_BUFFER_CREATE() literal_buffer = newBuffer(LITERAL_BUFFER_INITIAL_SIZE)" , "" , "/* Append characters at the end of the buffer. */" , "#define LITERAL_BUFFER_APPEND(s) bufferAppendString(literal_buffer, s)" , "" , "/* Append a character at the end of the buffer. */" , "#define LITERAL_BUFFER_APPEND_CHAR(c) bufferAppendChar(literal_buffer, c)" , "" , "/* Release the buffer, returning a pointer to its content. */" , "#define LITERAL_BUFFER_HARVEST() releaseBuffer(literal_buffer)" , "" , "/* In exceptional cases, e.g. when reaching EOF, we have to free the buffer. */" , "#define LITERAL_BUFFER_FREE() freeBuffer(literal_buffer)" , "" , "/* END extensible string buffer */" , "" ] -- For now all categories are included. -- Optimally only the ones that are used should be generated. cMacros :: CF -> String cMacros cf = unlines [ "LETTER [a-zA-Z]" , "CAPITAL [A-Z]" , "SMALL [a-z]" , "DIGIT [0-9]" , "IDENT [a-zA-Z0-9'_]" , unwords $ concat [ [ "%START CHAR CHARESC CHAREND STRING ESCAPED" ] , take (numberOfBlockCommentForms cf) commentStates ] , "" , "%% /* Rules. */" ] lexSymbols :: KeywordEnv -> String lexSymbols ss = concatMap transSym ss where transSym (s,r) = "\"" ++ s' ++ "\" \t return " ++ r ++ ";\n" where s' = escapeChars s restOfFlex :: InPackage -> CF -> SymMap -> String restOfFlex _inPackage cf env = unlines $ concat [ [ render $ lexComments $ comments cf , "" ] , userDefTokens , ifC catString $ lexStrings "yylval" "_STRING_" "_ERROR_" , ifC catChar $ lexChars "yylval" "_CHAR_" , ifC catDouble [ "{DIGIT}+\".\"{DIGIT}+(\"e\"(\\-)?{DIGIT}+)? \t yylval->_double = atof(yytext); return _DOUBLE_;" ] , ifC catInteger [ "{DIGIT}+ \t yylval->_int = atoi(yytext); return _INTEGER_;" ] , ifC catIdent [ "{LETTER}{IDENT}* \t yylval->_string = strdup(yytext); return _IDENT_;" ] , [ "[ \\t\\r\\n\\f] \t /* ignore white space. */;" , ". \t return _ERROR_;" , "" , "%% /* Initialization code. */" ] ] where ifC cat s = if isUsedCat cf (TokenCat cat) then s else [] userDefTokens = [ "" ++ printRegFlex exp ++ " \t yylval->_string = strdup(yytext); return " ++ sName name ++ ";" | (name, exp) <- tokenPragmas cf ] where sName n = fromMaybe n $ Map.lookup (Tokentype n) env footer :: String footer = unlines [ "yyscan_t initialize_lexer(FILE *inp)" , "{" , " yyscan_t scanner;" , " if (yylex_init_extra(NULL, &scanner)) return 0;" , " if (inp) yyrestart(inp, scanner);" , " return scanner;" , "}" ] -- | Lexing of strings, converting escaped characters. lexStrings :: String -> String -> String -> [String] lexStrings yylval stringToken errorToken = [ "\"\\\"\" \t LITERAL_BUFFER_CREATE(); BEGIN STRING;" , "\\\\ \t BEGIN ESCAPED;" , "\\\" \t " ++ yylval ++ "->_string = LITERAL_BUFFER_HARVEST(); BEGIN INITIAL; return " ++ stringToken ++ ";" , ". \t LITERAL_BUFFER_APPEND_CHAR(yytext[0]);" , "f \t LITERAL_BUFFER_APPEND_CHAR('\\f'); BEGIN STRING;" , "n \t LITERAL_BUFFER_APPEND_CHAR('\\n'); BEGIN STRING;" , "r \t LITERAL_BUFFER_APPEND_CHAR('\\r'); BEGIN STRING;" , "t \t LITERAL_BUFFER_APPEND_CHAR('\\t'); BEGIN STRING;" , "\\\" \t LITERAL_BUFFER_APPEND_CHAR('\"'); BEGIN STRING;" , "\\\\ \t LITERAL_BUFFER_APPEND_CHAR('\\\\'); BEGIN STRING;" , ". \t LITERAL_BUFFER_APPEND(yytext); BEGIN STRING;" , "<>\t LITERAL_BUFFER_FREE(); return " ++ errorToken ++ ";" ] -- | Lexing of characters, converting escaped characters. lexChars :: String -> String -> [String] lexChars yylval charToken = [ "\"'\" \tBEGIN CHAR;" , "\\\\ \t BEGIN CHARESC;" , "[^'] \t BEGIN CHAREND; " ++ yylval ++ "->_char = yytext[0]; return " ++ charToken ++ ";" , "f \t BEGIN CHAREND; " ++ yylval ++ "->_char = '\\f'; return " ++ charToken ++ ";" , "n \t BEGIN CHAREND; " ++ yylval ++ "->_char = '\\n'; return " ++ charToken ++ ";" , "r \t BEGIN CHAREND; " ++ yylval ++ "->_char = '\\r'; return " ++ charToken ++ ";" , "t \t BEGIN CHAREND; " ++ yylval ++ "->_char = '\\t'; return " ++ charToken ++ ";" , ". \t BEGIN CHAREND; " ++ yylval ++ "->_char = yytext[0]; return " ++ charToken ++ ";" , "\"'\" \t BEGIN INITIAL;" ] -- --------------------------------------------------------------------------- -- Comments -- | Create flex rules for single-line and multi-lines comments. -- The first argument is an optional namespace (for C++); the second -- argument is the set of comment delimiters as returned by BNFC.CF.comments. -- -- This function is only compiling the results of applying either -- lexSingleComment or lexMultiComment on each comment delimiter or pair of -- delimiters. -- -- >>> lexComments ([("{-","-}")],["--"]) -- "--"[^\n]* /* skip */; /* BNFC: comment "--" */ -- "{-" BEGIN COMMENT; /* BNFC: block comment "{-" "-}" */ -- "-}" BEGIN INITIAL; -- . /* skip */; -- [\n] /* skip */; lexComments :: ([(String, String)], [String]) -> Doc lexComments (m,s) = vcat $ concat [ map lexSingleComment s , zipWith lexMultiComment m commentStates ] -- | If we have several block comments, we need different COMMENT lexing states. commentStates :: [String] commentStates = map ("COMMENT" ++) $ "" : map show [1..] -- | Create a lexer rule for single-line comments. -- The first argument is -- an optional c++ namespace -- The second argument is the delimiter that marks the beginning of the -- comment. -- -- >>> lexSingleComment "--" -- "--"[^\n]* /* skip */; /* BNFC: comment "--" */ -- -- >>> lexSingleComment "\"" -- "\""[^\n]* /* skip */; /* BNFC: comment "\"" */ lexSingleComment :: String -> Doc lexSingleComment c = "" <> cstring c <> "[^\\n]*" <+> "/* skip */;" <+> unless (containsCCommentMarker c) ("/* BNFC: comment" <+> cstring c <+> "*/") containsCCommentMarker :: String -> Bool containsCCommentMarker s = "/*" `isInfixOf` s || "*/" `isInfixOf` s -- | Create a lexer rule for multi-lines comments. -- The first argument is -- an optional c++ namespace -- The second arguments is the pair of delimiter for the multi-lines comment: -- start deleminiter and end delimiter. -- There might be a possible bug here if a language includes 2 multi-line -- comments. They could possibly start a comment with one character and end it -- with another. However this seems rare. -- -- >>> lexMultiComment ("{-", "-}") "COMMENT" -- "{-" BEGIN COMMENT; /* BNFC: block comment "{-" "-}" */ -- "-}" BEGIN INITIAL; -- . /* skip */; -- [\n] /* skip */; -- -- >>> lexMultiComment ("\"'", "'\"") "COMMENT" -- "\"'" BEGIN COMMENT; /* BNFC: block comment "\"'" "'\"" */ -- "'\"" BEGIN INITIAL; -- . /* skip */; -- [\n] /* skip */; lexMultiComment :: (String, String) -> String -> Doc lexMultiComment (b,e) comment = vcat [ "" <> cstring b <+> "BEGIN" <+> text comment <> ";" <+> unless (containsCCommentMarker b || containsCCommentMarker e) ("/* BNFC: block comment" <+> cstring b <+> cstring e <+> "*/") , commentTag <> cstring e <+> "BEGIN INITIAL;" , commentTag <> ". /* skip */;" , commentTag <> "[\\n] /* skip */;" ] where commentTag = text $ "<" ++ comment ++ ">" -- | Helper function that escapes characters in strings. escapeChars :: String -> String escapeChars [] = [] escapeChars ('\\':xs) = '\\' : ('\\' : (escapeChars xs)) escapeChars ('\"':xs) = '\\' : ('\"' : (escapeChars xs)) escapeChars (x:xs) = x : (escapeChars xs)