{-# LANGUAGE OverloadedStrings #-} {- BNF Converter: ocamllex Generator Copyright (C) 2005 Author: Kristofer Johannisson -} -- based on BNFC Haskell backend module BNFC.Backend.OCaml.CFtoOCamlLex (cf2ocamllex) where import Prelude hiding ((<>)) import Data.Char (ord) import qualified Data.List as List import Text.PrettyPrint hiding (render) import qualified Text.PrettyPrint as PP import BNFC.Abs import BNFC.CF import BNFC.Backend.Common (asciiKeywords, unicodeAndSymbols) import BNFC.Backend.OCaml.CFtoOCamlYacc (terminal) import BNFC.Backend.OCaml.OCamlUtil (mkEsc, ocamlTokenName) import BNFC.Lexing (mkRegMultilineComment) import BNFC.Utils (cstring, unless) cf2ocamllex :: String -> String -> CF -> String cf2ocamllex _ parserMod cf = unlines $ List.intercalate [""] [ header parserMod cf , cMacros , rMacros cf , uMacros cf , [ PP.render $ rules cf ] ] header :: String -> CF -> [String] header parserMod cf = List.intercalate [""] . filter (not . null) $ concat [ [ [ "(* Lexer definition for ocamllex. *)" , "" , "(* preamble *)" , "{" , "open " ++ parserMod , "open Lexing" ] ] , hashtables cf , [ [ "let unescapeInitTail (s:string) : string =" , " let rec unesc s = match s with" , " '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs" , " | '\\\\'::'n'::cs -> '\\n' :: unesc cs" , " | '\\\\'::'t'::cs -> '\\t' :: unesc cs" , " | '\\\\'::'r'::cs -> '\\r' :: unesc cs" -- " | '\\\\'::'f'::cs -> '\\f' :: unesc cs", -- \f not supported by ocaml , " | '\\\"'::[] -> []" , " | c::cs -> c :: unesc cs" , " | _ -> []" , " (* explode/implode from caml FAQ *)" , " in let explode (s : string) : char list =" , " let rec exp i l =" , " if i < 0 then l else exp (i - 1) (s.[i] :: l) in" , " exp (String.length s - 1) []" , " in let implode (l : char list) : string =" , " let res = Buffer.create (List.length l) in" , " List.iter (Buffer.add_char res) l;" , " Buffer.contents res" , " in implode (unesc (List.tl (explode s)))" , "" , "let incr_lineno (lexbuf:Lexing.lexbuf) : unit =" , " let pos = lexbuf.lex_curr_p in" , " lexbuf.lex_curr_p <- { pos with" , " pos_lnum = pos.pos_lnum + 1;" , " pos_bol = pos.pos_cnum;" , " }" , "}" ] ] ] -- | Set up hashtables for reserved symbols and words. hashtables :: CF -> [[String]] hashtables cf = [ ht "symbol_table" $ unicodeAndSymbols cf , ht "resword_table" $ asciiKeywords cf ] where ht :: String -> [String] -> [String] ht table syms = unless (null syms) $ [ unwords [ "let", table, "= Hashtbl.create", show (length syms) ] , unwords [ "let _ = List.iter (fun (kwd, tok) -> Hashtbl.add", table, "kwd tok)" ] , concat [ " [", concat (List.intersperse ";" keyvals), "]" ] ] where keyvals = map (\ s -> concat [ "(", mkEsc s, ", ", terminal cf s, ")" ]) syms cMacros :: [String] cMacros = [ "(* BNFC character classes *)" , "let _letter = ['a'-'z' 'A'-'Z' '\\192' - '\\255'] # ['\\215' '\\247'] (* isolatin1 letter FIXME *)" , "let _upper = ['A'-'Z' '\\192'-'\\221'] # '\\215' (* capital isolatin1 letter FIXME *)" , "let _lower = ['a'-'z' '\\222'-'\\255'] # '\\247' (* small isolatin1 letter FIXME *)" , "let _digit = ['0'-'9'] (* _digit *)" , "let _idchar = _letter | _digit | ['_' '\\''] (* identifier character *)" , "let _universal = _ (* universal: any character *)" ] rMacros :: CF -> [String] rMacros cf | null symbs = [] | otherwise = [ "(* reserved words consisting of special symbols *)" , unwords $ "let rsyms =" : List.intersperse "|" (map mkEsc symbs) ] where symbs = unicodeAndSymbols cf -- user macros, derived from the user-defined tokens uMacros :: CF -> [String] uMacros cf = if null res then [] else "(* user-defined token types *)" : res where res = ["let " ++ name ++ " = " ++ rep | (name, rep, _, _) <- userTokens cf] -- | Returns the tuple of @(reg_name, reg_representation, token_name, is_position_token)@. userTokens :: CF -> [(String, String, String, Bool)] userTokens cf = [ (ocamlTokenName name, printRegOCaml reg, name, pos) | TokenReg n pos reg <- cfgPragmas cf , let name = wpThing n ] -- | Make OCamlLex rule -- >>> mkRule "token" [("REGEX1","ACTION1"),("REGULAREXPRESSION2","ACTION2"),("...","...")] -- (* lexing rules *) -- rule token = -- parse REGEX1 { ACTION1 } -- | REGULAREXPRESSION2 -- { ACTION2 } -- | ... { ... } -- -- If no regex are given, we dont create a lexer rule: -- >>> mkRule "empty" [] -- mkRule :: Doc -> [(Doc,Doc)] -> Doc mkRule _ [] = empty mkRule entrypoint (r:rs) = vcat [ "(* lexing rules *)" , "rule" <+> entrypoint <+> "=" , nest 2 $ hang "parse" 4 $ vcat $ nest 2 (mkOne r) : map (("|" <+>) . mkOne) rs ] where mkOne (regex, action) = regex $$ nest 8 (hsep ["{", action, "}"]) -- | Create regex for single line comments -- >>> mkRegexSingleLineComment "--" -- "--" (_ # '\n')* -- >>> mkRegexSingleLineComment "\"" -- "\"" (_ # '\n')* mkRegexSingleLineComment :: String -> Doc mkRegexSingleLineComment s = cstring s <+> "(_ # '\\n')*" -- | Create regex for multiline comments. -- >>> mkRegexMultilineComment "" -- "