{- BNF Converter: ocamllex Generator Copyright (C) 2005 Author: Kristofer Johannisson This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} -- based on BNFC Haskell backend module BNFC.Backend.OCaml.CFtoOCamlLex (cf2ocamllex) where import Control.Arrow ((&&&)) import Data.List import Data.Char import Text.PrettyPrint hiding (render) import qualified Text.PrettyPrint as PP import BNFC.CF import AbsBNF import BNFC.Backend.OCaml.CFtoOCamlYacc (terminal) import BNFC.Utils ((+++)) cf2ocamllex :: String -> String -> CF -> String cf2ocamllex _ parserMod cf = unlines $ intercalate [""] [ header parserMod cf, definitions cf, [PP.render (rules cf)] ] header :: String -> CF -> [String] header parserMod cf = [ "(* This ocamllex file was machine-generated by the BNF converter *)", "{", "open " ++ parserMod, "open Lexing", "", hashtables cf, "", "let unescapeInitTail (s:string) : string =", " let rec unesc s = match s with", " '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs", " | '\\\\'::'n'::cs -> '\\n' :: unesc cs", " | '\\\\'::'t'::cs -> '\\t' :: unesc cs", " | '\\\"'::[] -> []", " | c::cs -> c :: unesc cs", " | _ -> []", " (* explode/implode from caml FAQ *)", " in let explode (s : string) : char list =", " let rec exp i l =", " if i < 0 then l else exp (i - 1) (s.[i] :: l) in", " exp (String.length s - 1) []", " in let implode (l : char list) : string =", " let res = String.create (List.length l) in", " let rec imp i = function", " | [] -> res", " | c :: l -> res.[i] <- c; imp (i + 1) l in", " imp 0 l", " in implode (unesc (List.tl (explode s)))", "", "let incr_lineno (lexbuf:Lexing.lexbuf) : unit =", " let pos = lexbuf.lex_curr_p in", " lexbuf.lex_curr_p <- { pos with", " pos_lnum = pos.pos_lnum + 1;", " pos_bol = pos.pos_cnum;", " }", "}" ] -- | set up hashtables for reserved symbols and words hashtables :: CF -> String hashtables cf = ht "symbol_table" (symbols cf ) ++ "\n" ++ ht "resword_table" (reservedWords cf) where ht _ syms | null syms = "" ht table syms = unlines [ "let" +++ table +++ "= Hashtbl.create " ++ show (length syms), "let _ = List.iter (fun (kwd, tok) -> Hashtbl.add" +++ table +++ "kwd tok)", " [" ++ concat (intersperse ";" keyvals) ++ "]" ] where keyvals = map (\(x,y) -> "(" ++ x ++ ", " ++ y ++ ")") (zip (map show syms) (map (terminal cf) syms)) definitions :: CF -> [String] definitions cf = concat [ cMacros, rMacros cf, uMacros cf ] cMacros :: [String] cMacros = [ "let l = ['a'-'z' 'A'-'Z' '\\192' - '\\255'] # ['\\215' '\\247'] (* isolatin1 letter FIXME *)", "let c = ['A'-'Z' '\\192'-'\\221'] # ['\\215'] (* capital isolatin1 letter FIXME *)", "let s = ['a'-'z' '\\222'-'\\255'] # ['\\247'] (* small isolatin1 letter FIXME *)", "let d = ['0'-'9'] (* digit *)", "let i = l | d | ['_' '\\''] (* identifier character *)", "let u = ['\\000'-'\\255'] (* universal: any character *)" ] rMacros :: CF -> [String] rMacros cf = let symbs = symbols cf in (if null symbs then [] else [ "let rsyms = (* reserved words consisting of special symbols *)", " " ++ unwords (intersperse "|" (map mkEsc symbs)) ]) where mkEsc s = "\"" ++ concat (map f s) ++ "\"" f x = if x `elem` ['"','\\'] then "\\" ++ [x] else [x] -- user macros, derived from the user-defined tokens uMacros :: CF -> [String] uMacros cf = ["let " ++ name ++ " = " ++ rep | (name, rep, _) <- userTokens cf] -- returns the tuple of (reg_name, reg_representation, token_name) userTokens :: CF -> [(String, String, String)] userTokens cf = let regName = map toLower . show in [(regName name, printRegOCaml reg, show name) | (name, reg) <- tokenPragmas cf] -- | Make OCamlLex rule -- >>> mkRule "token" [("REGEX1","ACTION1"),("REGEX2","ACTION2"),("...","...")] -- rule token = -- parse REGEX1 {ACTION1} -- | REGEX2 {ACTION2} -- | ... {...} -- -- If no regex are given, we dont create a lexer rule: -- >>> mkRule "empty" [] -- mkRule :: Doc -> [(Doc,Doc)] -> Doc mkRule _ [] = empty mkRule entrypoint (r1:rn) = vcat [ "rule" <+> entrypoint <+> "=" , nest 2 $ hang "parse" 4 $ vcat (nest 2 (mkOne r1):map (("|" <+>) . mkOne) rn) ] where mkOne (regex, action) = regex <+> braces action -- | Create regex for single line comments -- >>> mkRegexSingleLineComment "--" -- "--" (_ # '\n')* mkRegexSingleLineComment :: String -> Doc mkRegexSingleLineComment s = doubleQuotes (text s) <+> "(_ # '\\n')*" -- | Create regex for multiline comments -- >>> mkRegexMultilineComment "" -- "" mkRegexMultilineComment :: String -> String -> Doc mkRegexMultilineComment b e = lit b <+> parens ( hsep $ intersperse "|" subregexs ) <> "*" <+> lit [head e] <> "*" <+> lit e where lit :: String -> Doc lit "" = empty lit [c] = quotes (char c) lit s = doubleQuotes (text s) prefix = map (init &&& last) (drop 1 (inits e)) subregexs = [ lit ss <+> parens ("u #" <+> brackets (lit [s])) | (ss,s) <- prefix] -- | Uses the function from above to make a lexer rule from the CF grammar rules :: CF -> Doc rules cf = mkRule "token" $ -- comments [ (mkRegexSingleLineComment s, "token lexbuf") | s <- singleLineC ] ++ [ (mkRegexMultilineComment b e, "token lexbuf") | (b,e) <- multilineC] ++ -- user tokens [ (text n , tokenAction (text t)) | (n,_,t) <- userTokens cf] ++ -- predefined tokens [ ( "l i*", tokenAction "Ident" ) ] ++ [ ( "rsyms" , "let id = lexeme lexbuf in try Hashtbl.find symbol_table id with Not_found -> failwith (\"internal lexer error: reserved symbol \" ^ id ^ \" not found in hashtable\")" ) | not (null (symbols cf))] ++ -- integers [ ( "d+", "let i = lexeme lexbuf in TOK_Integer (int_of_string i)" ) -- doubles , ( "d+ '.' d+ ('e' ('-')? d+)?" , "let f = lexeme lexbuf in TOK_Double (float_of_string f)" ) -- strings , ( "'\\\"' ((u # ['\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't')))* '\\\"'" , "let s = lexeme lexbuf in TOK_String (unescapeInitTail s)" ) -- chars , ( "'\\'' ((u # ['\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't'))) '\\\''" , "let s = lexeme lexbuf in TOK_Char s.[1]") -- spaces , ( "[' ' '\\t']", "token lexbuf") -- new lines , ( "'\\n'", "incr_lineno lexbuf; token lexbuf" ) -- end of file , ( "eof", "TOK_EOF" ) ] where (multilineC, singleLineC) = comments cf tokenAction t = case reservedWords cf of [] -> "let l = lexeme lexbuf in TOK_" <> t <>" l" _ -> "let l = lexeme lexbuf in try Hashtbl.find resword_table l with Not_found -> TOK_" <> t <+> "l" ------------------------------------------------------------------- -- Modified from the inlined version of @RegToAlex@. ------------------------------------------------------------------- -- modified from pretty-printer generated by the BNF converter -- the top-level printing method printRegOCaml :: Reg -> String printRegOCaml = render . prt 0 -- you may want to change render and parenth render :: [String] -> String render = rend 0 where rend :: Int -> [String] -> String rend i ss = case ss of "[" :ts -> cons "[" $ rend i ts "(" :ts -> cons "(" $ rend i ts t : "," :ts -> cons t $ space "," $ rend i ts t : ")" :ts -> cons t $ cons ")" $ rend i ts t : "]" :ts -> cons t $ cons "]" $ rend i ts t :ts -> space t $ rend i ts _ -> "" cons s t = s ++ t space t s = if null s then t else t ++ " " ++ s parenth :: [String] -> [String] parenth ss = ["("] ++ ss ++ [")"] -- the printer class does the job class Print a where prt :: Int -> a -> [String] prtList :: [a] -> [String] prtList = concat . map (prt 0) instance Print a => Print [a] where prt _ = prtList instance Print Char where prt _ c = [show c] -- if isAlphaNum c then [[c]] else ['\\':[c]] prtList s = [show s] -- map (concat . prt 0) s prPrec :: Int -> Int -> [String] -> [String] prPrec i j = if j prPrec i 2 (concat [prt 2 reg0 , prt 3 reg]) RAlt reg0 reg -> prPrec i 1 (concat [prt 1 reg0 , ["|"] , prt 2 reg]) RMinus reg0 reg -> prPrec i 1 (concat [prt 2 reg0 , ["#"] , prt 2 reg]) RStar reg -> prPrec i 3 (concat [prt 3 reg , ["*"]]) RPlus reg -> prPrec i 3 (concat [prt 3 reg , ["+"]]) ROpt reg -> prPrec i 3 (concat [prt 3 reg , ["?"]]) REps -> prPrec i 3 (["\"\""]) -- special construct for eps in ocamllex? RChar c -> prPrec i 3 (concat [prt 0 c]) RAlts str -> prPrec i 3 (concat [["["], [concatMap show str], ["]"]]) RSeqs str -> prPrec i 2 (concat (map (prt 0) str)) RDigit -> prPrec i 3 (concat [["d"]]) RLetter -> prPrec i 3 (concat [["l"]]) RUpper -> prPrec i 3 (concat [["c"]]) RLower -> prPrec i 3 (concat [["s"]]) RAny -> prPrec i 3 (concat [["u"]])