{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

{-
    BNF Converter: Java Antlr4 Lexer generator
    Copyright (C) 2015  Author:  Gabriele Paganelli

    Description   : This module generates the Antlr4 input file.
                    Based on CFtoJLex15.hs

    Author        : Gabriele Paganelli (gapag@distruzione.org)
    Created       : 15 Oct, 2015

-}

module BNFC.Backend.Java.CFtoAntlr4Lexer ( cf2AntlrLex ) where

import Prelude hiding ((<>))

import Text.PrettyPrint
import BNFC.CF
import BNFC.Backend.Java.RegToAntlrLexer
import BNFC.Backend.Common.NamedVariables

-- | Creates a lexer grammar.
-- Since antlr token identifiers must start with an uppercase symbol,
-- I prepend "Surrogate_id_SYMB_" to the identifier.
-- This introduces risks of clashes if somebody uses the same identifier for
-- user defined tokens. This is not handled.
-- returns the environment because the parser uses it.
cf2AntlrLex :: String -> CF -> (Doc, KeywordEnv)
cf2AntlrLex :: String -> CF -> (Doc, KeywordEnv)
cf2AntlrLex String
lang CF
cf = (,KeywordEnv
env) (Doc -> (Doc, KeywordEnv)) -> Doc -> (Doc, KeywordEnv)
forall a b. (a -> b) -> a -> b
$ [Doc] -> Doc
vcat
    [ String -> Doc
prelude String
lang
    , Doc
cMacros
    -- unnamed symbols (those in quotes, not in token definitions)
    , KeywordEnv -> Doc
lexSymbols KeywordEnv
env
    , CF -> Doc
restOfLexerGrammar CF
cf
    ]
  where
    env :: KeywordEnv
env = [String] -> [String] -> KeywordEnv
forall a b. [a] -> [b] -> [(a, b)]
zip (CF -> [String]
forall function. CFG function -> [String]
cfgSymbols CF
cf [String] -> [String] -> [String]
forall a. [a] -> [a] -> [a]
++ CF -> [String]
forall function. CFG function -> [String]
reservedWords CF
cf) ([String] -> KeywordEnv) -> [String] -> KeywordEnv
forall a b. (a -> b) -> a -> b
$ (Int -> String) -> [Int] -> [String]
forall a b. (a -> b) -> [a] -> [b]
map ((String
"Surrogate_id_SYMB_" String -> String -> String
forall a. [a] -> [a] -> [a]
++) (String -> String) -> (Int -> String) -> Int -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> String
forall a. Show a => a -> String
show) [Int
0 :: Int ..]


-- | File prelude
prelude :: String -> Doc
prelude :: String -> Doc
prelude String
lang = [Doc] -> Doc
vcat
    [ Doc
"// Lexer definition for use with Antlr4"
    , Doc
"lexer grammar" Doc -> Doc -> Doc
<+> String -> Doc
text String
lang Doc -> Doc -> Doc
<> Doc
"Lexer;"
    ]

--For now all categories are included.
--Optimally only the ones that are used should be generated.
cMacros :: Doc
cMacros :: Doc
cMacros = [Doc] -> Doc
vcat
    [ Doc
"// Predefined regular expressions in BNFC"
    , Doc -> Doc
frg Doc
"LETTER  : CAPITAL | SMALL"
    , Doc -> Doc
frg Doc
"CAPITAL : [A-Z\\u00C0-\\u00D6\\u00D8-\\u00DE]"
    , Doc -> Doc
frg Doc
"SMALL   : [a-z\\u00DF-\\u00F6\\u00F8-\\u00FF]"
    , Doc -> Doc
frg Doc
"DIGIT   : [0-9]"
    ]
  where frg :: Doc -> Doc
frg Doc
a = Doc
"fragment" Doc -> Doc -> Doc
<+> Doc
a Doc -> Doc -> Doc
<+> Doc
";"

escapeChars :: String -> String
escapeChars :: String -> String
escapeChars = (Char -> String) -> String -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Char -> String
escapeCharInSingleQuotes

-- |
-- >>> lexSymbols [("foo","bar")]
-- bar : 'foo' ;
-- >>> lexSymbols [("\\","bar")]
-- bar : '\\' ;
-- >>> lexSymbols [("/","bar")]
-- bar : '/' ;
-- >>> lexSymbols [("~","bar")]
-- bar : '~' ;
lexSymbols :: KeywordEnv -> Doc
lexSymbols :: KeywordEnv -> Doc
lexSymbols KeywordEnv
ss = [Doc] -> Doc
vcat ([Doc] -> Doc) -> [Doc] -> Doc
forall a b. (a -> b) -> a -> b
$  ((String, String) -> Doc) -> KeywordEnv -> [Doc]
forall a b. (a -> b) -> [a] -> [b]
map (String, String) -> Doc
transSym KeywordEnv
ss
  where
    transSym :: (String, String) -> Doc
transSym (String
s,String
r) = String -> Doc
text String
r Doc -> Doc -> Doc
<>  Doc
" : '" Doc -> Doc -> Doc
<> String -> Doc
text (String -> String
escapeChars String
s) Doc -> Doc -> Doc
<> Doc
"' ;"

-- | Writes rules for user defined tokens, and, if used, the predefined BNFC tokens.
restOfLexerGrammar :: CF -> Doc
restOfLexerGrammar :: CF -> Doc
restOfLexerGrammar CF
cf = [Doc] -> Doc
vcat
    [ (KeywordEnv, [String]) -> Doc
lexComments (CF -> (KeywordEnv, [String])
comments CF
cf)
    , Doc
""
    , Doc
userDefTokens
    , [Doc] -> Doc
ifString [Doc]
strdec
    , [Doc] -> Doc
ifChar [Doc]
chardec
    , String -> [Doc] -> Doc
ifC String
catDouble [
        Doc
"// Double predefined token type",
        Doc
"DOUBLE : DIGIT+ '.' DIGIT+ ('e' '-'? DIGIT+)?;"
        ]
    , String -> [Doc] -> Doc
ifC String
catInteger [
        Doc
"//Integer predefined token type",
        Doc
"INTEGER : DIGIT+;"
        ]
    , String -> [Doc] -> Doc
ifC String
catIdent [
        Doc
"// Identifier token type" ,
        Doc
"fragment" ,
        Doc
"IDENTIFIER_FIRST : LETTER | '_';",
        Doc
"IDENT : IDENTIFIER_FIRST (IDENTIFIER_FIRST | DIGIT)*;"
        ]
    , Doc
"// Whitespace"
    , Doc
"WS : (' ' | '\\r' | '\\t' | '\\n' | '\\f')+ ->  skip;"
    , Doc
"// Escapable sequences"
    , Doc
"fragment"
    , Doc
"Escapable : ('\"' | '\\\\' | 'n' | 't' | 'r' | 'f');"
    , Doc
"ErrorToken : . ;"
    , [Doc] -> Doc
ifString [Doc]
stringmodes
    , [Doc] -> Doc
ifChar [Doc]
charmodes
    ]
  where
    ifC :: String -> [Doc] -> Doc
ifC String
cat [Doc]
s     = if CF -> Cat -> Bool
forall f. CFG f -> Cat -> Bool
isUsedCat CF
cf (String -> Cat
TokenCat String
cat) then [Doc] -> Doc
vcat [Doc]
s else Doc
""
    ifString :: [Doc] -> Doc
ifString      = String -> [Doc] -> Doc
ifC String
catString
    ifChar :: [Doc] -> Doc
ifChar        = String -> [Doc] -> Doc
ifC String
catChar
    strdec :: [Doc]
strdec        = [ Doc
"// String token type"
                    , Doc
"STRING : '\"' -> more, mode(STRINGMODE);"
                    ]
    chardec :: [Doc]
chardec       = [Doc
"CHAR : '\\''   -> more, mode(CHARMODE);"]
    userDefTokens :: Doc
userDefTokens = [Doc] -> Doc
vcat
        [ String -> Doc
text String
name Doc -> Doc -> Doc
<> Doc
" : " Doc -> Doc -> Doc
<> String -> Doc
text (Reg -> String
printRegJLex Reg
exp) Doc -> Doc -> Doc
<> Doc
";"
        | (String
name, Reg
exp) <- CF -> [(String, Reg)]
forall f. CFG f -> [(String, Reg)]
tokenPragmas CF
cf ]
    stringmodes :: [Doc]
stringmodes   = [ Doc
"mode STRESCAPE;"
        , Doc
"STRESCAPED : Escapable  -> more, popMode ;"
        , Doc
"mode STRINGMODE;"
        , Doc
"STRINGESC : '\\\\' -> more , pushMode(STRESCAPE);"
        , Doc
"STRINGEND : '\"' ->  type(STRING), mode(DEFAULT_MODE);"
        , Doc
"STRINGTEXT : ~[\"\\\\] -> more;"
        ]
    charmodes :: [Doc]
charmodes     = [ Doc
"mode CHARMODE;"
        , Doc
"CHARANY     :  ~['\\\\] -> more, mode(CHAREND);"
        , Doc
"CHARESC     :  '\\\\'  -> more, pushMode(CHAREND),pushMode(ESCAPE);"
        , Doc
"mode ESCAPE;"
        , Doc
"ESCAPED : (Escapable | '\\'')  -> more, popMode ;"
        , Doc
"mode CHAREND;"
        , Doc
"CHARENDC     :  '\\''  -> type(CHAR), mode(DEFAULT_MODE);"
        ]

lexComments :: ([(String, String)], [String]) -> Doc
lexComments :: (KeywordEnv, [String]) -> Doc
lexComments ([],[]) = Doc
""
lexComments (KeywordEnv
m,[String]
s) = [Doc] -> Doc
vcat
    (Doc -> (String -> Doc) -> [String] -> [Doc]
forall {a}. Doc -> (a -> Doc) -> [a] -> [Doc]
prod Doc
"COMMENT_antlr_builtin" String -> Doc
lexSingleComment [String]
s [Doc] -> [Doc] -> [Doc]
forall a. [a] -> [a] -> [a]
++
         Doc -> ((String, String) -> Doc) -> KeywordEnv -> [Doc]
forall {a}. Doc -> (a -> Doc) -> [a] -> [Doc]
prod Doc
"MULTICOMMENT_antlr_builtin" (String, String) -> Doc
lexMultiComment KeywordEnv
m )

  where
    prod :: Doc -> (a -> Doc) -> [a] -> [Doc]
prod Doc
bg a -> Doc
lc [a]
ty = [Doc
bg, Doc
": ("] [Doc] -> [Doc] -> [Doc]
forall a. [a] -> [a] -> [a]
++ Doc -> [Doc] -> [Doc]
punctuate Doc
"|" ((a -> Doc) -> [a] -> [Doc]
forall a b. (a -> b) -> [a] -> [b]
map a -> Doc
lc [a]
ty) [Doc] -> [Doc] -> [Doc]
forall a. [a] -> [a] -> [a]
++ [Doc]
skiplex
    skiplex :: [Doc]
skiplex       = [Doc
") -> skip;"]

-- | Create lexer rule for single-line comments.
--
-- >>> lexSingleComment "--"
-- '--' ~[\r\n]* (('\r'? '\n')|EOF)
--
-- >>> lexSingleComment "\""
-- '"' ~[\r\n]* (('\r'? '\n')|EOF)
lexSingleComment :: String -> Doc
lexSingleComment :: String -> Doc
lexSingleComment String
c =
    Doc
"'" Doc -> Doc -> Doc
<>String -> Doc
text (String -> String
escapeChars String
c) Doc -> Doc -> Doc
<>  Doc
"' ~[\\r\\n]* (('\\r'? '\\n')|EOF)"

-- | Create lexer rule for multi-lines comments.
--
-- There might be a possible bug here if a language includes 2 multi-line
-- comments. They could possibly start a comment with one character and end it
-- with another. However this seems rare.
--
-- >>> lexMultiComment ("{-", "-}")
-- '{-' (.)*? '-}'
--
-- >>> lexMultiComment ("\"'", "'\"")
-- '"\'' (.)*? '\'"'
lexMultiComment :: (String, String) -> Doc
lexMultiComment :: (String, String) -> Doc
lexMultiComment (String
b,String
e) =
         Doc
"'" Doc -> Doc -> Doc
<> String -> Doc
text (String -> String
escapeChars String
b)
        Doc -> Doc -> Doc
<>Doc
"' (.)*? '"Doc -> Doc -> Doc
<> String -> Doc
text (String -> String
escapeChars String
e)
        Doc -> Doc -> Doc
<> Doc
"'"