{-# LANGUAGE PatternGuards #-}

module BNFC.Lexing
    ( mkLexer, LexType(..), mkRegMultilineComment ) where

import Prelude hiding ((<>))

-- import Control.Arrow ( (&&&) )
import Data.List     ( inits, tails )

import BNFC.Abs      ( Reg(..)   )
import BNFC.Print    ( printTree )  -- for debug printing
import BNFC.CF
import BNFC.Regex    ( simpReg )
import BNFC.Utils    ( unless  )

debugPrint :: Reg -> IO ()
debugPrint :: Reg -> IO ()
debugPrint = [Char] -> IO ()
putStrLn ([Char] -> IO ()) -> (Reg -> [Char]) -> Reg -> IO ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[Char]] -> [Char]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat ([[Char]] -> [Char]) -> (Reg -> [[Char]]) -> Reg -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [[Char]]
words ([Char] -> [[Char]]) -> (Reg -> [Char]) -> Reg -> [[Char]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Reg -> [Char]
forall a. Print a => a -> [Char]
printTree

-- Abstract lexer

data LexType = LexComment | LexToken String | LexSymbols

mkLexer :: CF -> [(Reg, LexType)]
mkLexer :: CF -> [(Reg, LexType)]
mkLexer CF
cf = [[(Reg, LexType)]] -> [(Reg, LexType)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat
    -- comments
  [ [ ([Char] -> Reg
mkRegSingleLineComment [Char]
s, LexType
LexComment) | [Char]
s <- ([([Char], [Char])], [[Char]]) -> [[Char]]
forall a b. (a, b) -> b
snd (CF -> ([([Char], [Char])], [[Char]])
comments CF
cf) ]
  , [ ([Char] -> [Char] -> Reg
mkRegMultilineComment [Char]
b [Char]
e, LexType
LexComment) | ([Char]
b,[Char]
e) <- ([([Char], [Char])], [[Char]]) -> [([Char], [Char])]
forall a b. (a, b) -> a
fst (CF -> ([([Char], [Char])], [[Char]])
comments CF
cf) ]
    -- user tokens
  , [ (Reg
reg, [Char] -> LexType
LexToken [Char]
name) | ([Char]
name, Reg
reg) <- CF -> [([Char], Reg)]
forall f. CFG f -> [([Char], Reg)]
tokenPragmas CF
cf]
    -- predefined tokens
  , [ ( Reg
regIdent, [Char] -> LexType
LexToken [Char]
"Ident" ) ]
    -- Symbols
  , Bool -> [(Reg, LexType)] -> [(Reg, LexType)]
forall m. Monoid m => Bool -> m -> m
unless ([[Char]] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null ([[Char]] -> Bool) -> [[Char]] -> Bool
forall a b. (a -> b) -> a -> b
$ CF -> [[Char]]
forall function. CFG function -> [[Char]]
cfgSymbols CF
cf) [ ((Reg -> Reg -> Reg) -> [Reg] -> Reg
forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldl1 Reg -> Reg -> Reg
RAlt (([Char] -> Reg) -> [[Char]] -> [Reg]
forall a b. (a -> b) -> [a] -> [b]
map [Char] -> Reg
RSeqs (CF -> [[Char]]
forall function. CFG function -> [[Char]]
cfgSymbols CF
cf)), LexType
LexSymbols ) ]
    -- Built-ins
  , [ ( Reg
regInteger, [Char] -> LexType
LexToken [Char]
"Integer")
    , ( Reg
regDouble , [Char] -> LexType
LexToken [Char]
"Double" )
    , ( Reg
regString , [Char] -> LexType
LexToken [Char]
"String" )
    , ( Reg
regChar   , [Char] -> LexType
LexToken [Char]
"Char"   )
    ]
  ]


<> :: Reg -> Reg -> Reg
(<>) = Reg -> Reg -> Reg
RSeq
<|> :: Reg -> Reg -> Reg
(<|>) = Reg -> Reg -> Reg
RAlt

-- Bult-in tokens
-- the tests make sure that they correspond to what is in the LBNF reference

-- | Ident regex
-- >>> debugPrint regIdent
-- letter(letter|digit|'_'|'\'')*
regIdent :: Reg
regIdent :: Reg
regIdent = Reg
RLetter Reg -> Reg -> Reg
<> Reg -> Reg
RStar (Reg
RLetter Reg -> Reg -> Reg
<|> Reg
RDigit Reg -> Reg -> Reg
<|> Char -> Reg
RChar Char
'_' Reg -> Reg -> Reg
<|> Char -> Reg
RChar Char
'\'')

-- | Integer regex
-- >>> debugPrint regInteger
-- digit+
regInteger :: Reg
regInteger :: Reg
regInteger = Reg -> Reg
RPlus Reg
RDigit

-- | String regex
-- >>> debugPrint regString
-- '"'(char-["\"\\"]|'\\'["\"\\nt"])*'"'
regString :: Reg
regString :: Reg
regString = Char -> Reg
RChar Char
'"'
            Reg -> Reg -> Reg
<> Reg -> Reg
RStar ((Reg
RAny Reg -> Reg -> Reg
`RMinus` [Char] -> Reg
RAlts [Char]
"\"\\")
                       Reg -> Reg -> Reg
<|> (Char -> Reg
RChar Char
'\\' Reg -> Reg -> Reg
<> [Char] -> Reg
RAlts [Char]
"\"\\nt"))
            Reg -> Reg -> Reg
<> Char -> Reg
RChar Char
'"'

-- | Char regex
-- >>> debugPrint regChar
-- '\''(char-["'\\"]|'\\'["'\\nt"])'\''
regChar :: Reg
regChar :: Reg
regChar = Char -> Reg
RChar Char
'\''
          Reg -> Reg -> Reg
<> (Reg -> Reg -> Reg
RMinus Reg
RAny ([Char] -> Reg
RAlts [Char]
"'\\") Reg -> Reg -> Reg
<|> (Char -> Reg
RChar Char
'\\' Reg -> Reg -> Reg
<> [Char] -> Reg
RAlts [Char]
"'\\nt"))
          Reg -> Reg -> Reg
<> Char -> Reg
RChar Char
'\''

-- | Double regex
-- >>> debugPrint regDouble
-- digit+'.'digit+('e''-'?digit+)?
regDouble :: Reg
regDouble :: Reg
regDouble = Reg -> Reg
RPlus Reg
RDigit Reg -> Reg -> Reg
<> Char -> Reg
RChar Char
'.' Reg -> Reg -> Reg
<> Reg -> Reg
RPlus Reg
RDigit
            Reg -> Reg -> Reg
<> Reg -> Reg
ROpt (Char -> Reg
RChar Char
'e' Reg -> Reg -> Reg
<> Reg -> Reg
ROpt (Char -> Reg
RChar Char
'-') Reg -> Reg -> Reg
<> Reg -> Reg
RPlus Reg
RDigit)

-- | Create regex for single line comments
-- >>> debugPrint $ mkRegSingleLineComment "--"
-- {"--"}char*'\n'
mkRegSingleLineComment :: String -> Reg
mkRegSingleLineComment :: [Char] -> Reg
mkRegSingleLineComment [Char]
s = [Char] -> Reg
RSeqs [Char]
s Reg -> Reg -> Reg
<> Reg -> Reg
RStar Reg
RAny Reg -> Reg -> Reg
<> Char -> Reg
RChar Char
'\n'


-- -- | Create regex for multiline comments.
-- --
-- -- >>> debugPrint $ mkRegMultilineComment "<" ">"
-- -- '<'(char-'>')*'>'
-- --
-- -- >>> debugPrint $ mkRegMultilineComment "<!--" "-->"
-- -- {"<!--"}(char-'-')*'-'((char-'-')+'-')*'-'('-'|(char-["->"])(char-'-')*'-'((char-'-')+'-')*'-')*'>'
-- --
-- mkRegMultilineComment :: String -> String -> Reg
-- mkRegMultilineComment b []       = RSeqs b
-- mkRegMultilineComment b (a:rest) = simpReg $ RSeqs b `RSeq` fromStart
--   where
--   notA                       = RAny `RMinus` RChar a
--   goA                        = RStar notA `RSeq` RChar a
--   (fromStart, _, _)          = foldl f (goA, REps, []) rest
--   -- Build up automaton states Start, A, ...ys..., x, ...
--   f (fromStart, fromA, ys) x = (advance fromStart, advance fromA, x:ys)
--     where
--     advance from = (from `RSeq` RStar idle) `RSeq` RChar x
--     idle         = foldl1 RAlt $ concat
--       -- cannot advance, ...
--       [ [ RChar a              | a /= x, all (a ==) ys            ] -- but can stay
--       , [ RChar a `RSeq` fromA | a /= x, null ys || any (a /=) ys ] -- but can fall back to A
--       , [ (RAny `RMinus` RAlts [x,a]) `RSeq` fromStart            ] -- neither, need to restart
--       ]


-- | Create regex for multiline comments.
--
-- >>> debugPrint $ mkRegMultilineComment "<" ">"
-- '<'(char-'>')*'>'
--
-- >>> debugPrint $ mkRegMultilineComment "/*" "*/"
-- {"/*"}(char-'*')*'*'((char-["*/"])(char-'*')*'*'|'*')*'/'
--
-- >>> debugPrint $ mkRegMultilineComment "<!--" "-->"
-- {"<!--"}(char-'-')*'-'((char-'-')+'-')*'-'((char-["->"])(char-'-')*'-'((char-'-')+'-')*'-'|'-')*'>'
--
mkRegMultilineComment :: String -> String -> Reg
mkRegMultilineComment :: [Char] -> [Char] -> Reg
mkRegMultilineComment [Char]
begin [Char]
end = Reg -> Reg
simpReg (Reg -> Reg) -> Reg -> Reg
forall a b. (a -> b) -> a -> b
$ Reg -> [Reg] -> Reg
joinSteps ([Char] -> Reg
RSeqs [Char]
begin) [Reg]
allSteps
  where

  -- This handles cases beyond comment terminators such as "*/" and "-->".
  -- In the general but unlikely case, a comment terminator may have
  -- non-trivial internal repetitions, like in "ananas".  While lexing
  -- "anananas", we need, after having seen "anana", fall back to state
  -- "ana", to correctly handle the rest "nas" of the input and recognize the
  -- comment terminator.

  -- See the Knuth-Morris-Pratt algorithm of complexity O(n+m) to recognize a
  -- keyword of length m in a text of length n.
  -- (Dragon book second edition section 3.4.5;
  -- Knuth/Morris/Pratt (J. Computing 1977),
  -- "Fast pattern matching on strings").

  -- The basic idea is to construct the regular expression to recognize
  -- a text not containing @end@ but ending in @end@ from this DFA:
  --
  -- * DFA-states: the prefixes of @end@, formally @inits end@,
  --   written a(1..i) for @i <= length end@.
  --
  -- * Primary transitions ("spine") take us from state a(1..i) (called @ys@)
  --   to a(1..i+1) (called @x:ys@), consuming character a(i+1) (called @x@).
  --
  -- * Fallback transitions take us from state a(1..i) (@ys@) to some previous
  --   state a(1..j) with j <= i, consuming character @z@=a(j) (unless j=0).
  --   The main condition for fallbacks is a(i-j+2..i)=a(1..j-1) ("suffix = prefix"),
  --   because then we can append a(j) to our truncated history a(i-j+2..i)
  --   and end up in a(1..j).
  --   The secondary condition is that we are obliged to not fall back further
  --   than we must:  If consuming @z@ can get us to a(1..k) with k > j,
  --   we cannot fall back to a(1..j).
  --
  -- The final @Reg@ transitions along the spine also involve "idling" on a state,
  -- meaning transition sequences bringing us back to the same state.
  -- The list @steps@ will contain the "spine" transitions (a(1..i)->a(1..i+1))
  -- _including_ the idling.  The first entry in the list is the last transition
  -- computed so far.  @allSteps@ is then the complete @steps@ list, which can be
  -- joined by @RSeq@ (function @joinSteps@).
  --
  -- Remark:
  -- Note that the generated regex can be too big for lexers to handle.
  -- For the example @end == "ananas"@, ocamllex uses up ~30.000 of its
  -- 32.767 maximal automaton transitions, which prevents comments
  -- ending in "ananas" to be part of a ocamllex lexer definition in practice.
  -- The Haskell lexer generator Alex is slow as well on this example,
  -- although the produced lexer is unproblematic in the end.
  --
  -- Lexer generators _should_ be able to handle the regex we are producing here
  -- because the DFA has only O(n) states and O(n²) transitions where @n = length end@
  -- is the length of the comment terminator @end@.
  --
  -- It is just an awkward way to generate this DFA via the detour over a regex
  -- which in turn is dictated by the interface of lexer generators.
  -- The size of the regex tree seems to be O(n³)!?
  -- It would be much smaller as DAG (tree with sharing).
  -- Lexer generators often support regex definitions; we could make each entry
  -- in @steps@ a defined regex.  However, it is not clear whether this sharing
  -- is utilized in the regex → NFA construction in the lexer generators.
  joinSteps :: Reg -> [Reg] -> Reg
  joinSteps :: Reg -> [Reg] -> Reg
joinSteps = (Reg -> Reg -> Reg) -> Reg -> [Reg] -> Reg
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr ((Reg -> Reg -> Reg) -> Reg -> Reg -> Reg
forall a b c. (a -> b -> c) -> b -> a -> c
flip Reg -> Reg -> Reg
RSeq)
  -- Transitions of the spine of the automaton, with last transition first in the list.
  allSteps :: [Reg]
  allSteps :: [Reg]
allSteps = ([Reg], [Char]) -> [Reg]
forall a b. (a, b) -> a
fst (([Reg], [Char]) -> [Reg]) -> ([Reg], [Char]) -> [Reg]
forall a b. (a -> b) -> a -> b
$ (([Reg], [Char]) -> Char -> ([Reg], [Char]))
-> ([Reg], [Char]) -> [Char] -> ([Reg], [Char])
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl ([Reg], [Char]) -> Char -> ([Reg], [Char])
next ([],[]) [Char]
end

  -- @next (steps, ys) x@ calculates the next step,
  -- taking us from state @ys@ to state @x:ys@.
  next :: ([Reg],[Char]) -> Char -> ([Reg],[Char])
  next :: ([Reg], [Char]) -> Char -> ([Reg], [Char])
next
    ( [Reg]
steps  -- [r(i-1,i), ..., r(0,1)], empty if i == 0
    , [Char]
ys     -- [a(i),...,a(1)]        , empty if i == 0
    ) Char
x      -- a(i+1)
    = (Reg
step Reg -> [Reg] -> [Reg]
forall a. a -> [a] -> [a]
: [Reg]
steps, Char
xChar -> [Char] -> [Char]
forall a. a -> [a] -> [a]
:[Char]
ys)
    where

    -- step = r(i,i+1) is the regular expression to go to next state.
    -- We can idle on state a(1..i) and then take the transition to a(1..i+1).
    step :: Reg
    step :: Reg
step = Reg -> Reg
RStar Reg
idle Reg -> Reg -> Reg
`RSeq` Char -> Reg
RChar Char
x
    -- @idle@ presents all the possibilities to stay on the current state
    -- or fall back to a previous state and then again advance to the present state.
    -- We consider first the possibility to fall back to the start state a(1..0),
    -- and then the possibility to fall back to a(1..1), then, to a(1..2), etc.,
    -- until staying on a(1..i).
    -- We are obliged to stay as far advanced as possible, we can only fall
    -- father back if we cannot stay more upfront.
    -- Transitioning to state a(1..j) is possible if
    --   * the next character is not x (a(i+1)),
    --   * the next character is a(j),
    --   * the last j-1 characters we processed, a(i-j+2..j) are a(1..j-1),
    --   * we cannot transition to a(1..j+1), a(1..j+2), ..., a(1..i).
    idle :: Reg
    idle :: Reg
idle = (Reg -> Reg -> Reg) -> Reg -> [Reg] -> Reg
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl Reg -> Reg -> Reg
RAlt Reg
toStart ([Reg] -> Reg) -> [Reg] -> Reg
forall a b. (a -> b) -> a -> b
$ ((Char, Reg) -> Reg) -> [(Char, Reg)] -> [Reg]
forall a b. (a -> b) -> [a] -> [b]
map (Char, Reg) -> Reg
forall a b. (a, b) -> b
snd [(Char, Reg)]
possibilities
      where
      -- List of possibilities to go back to a previous state upon
      -- the given character and how to return to the present state.
      -- We calculate the possibilities in order of:
      --   * staying on the current state
      --   * falling back one state
      --   * falling back two states
      --   * ...
      --   * falling back to the start.
      -- The reason is that falling back further than necessary is not allowed.
      possibilities :: [(Char,Reg)]
      possibilities :: [(Char, Reg)]
possibilities = ([(Char, Reg)] -> (Char, Bool, [Reg]) -> [(Char, Reg)])
-> [(Char, Reg)] -> [(Char, Bool, [Reg])] -> [(Char, Reg)]
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl [(Char, Reg)] -> (Char, Bool, [Reg]) -> [(Char, Reg)]
addPoss [] ([Char] -> [Bool] -> [[Reg]] -> [(Char, Bool, [Reg])]
forall a b c. [a] -> [b] -> [c] -> [(a, b, c)]
zip3 [Char]
ys [Bool]
conds ([[Reg]] -> [(Char, Bool, [Reg])])
-> [[Reg]] -> [(Char, Bool, [Reg])]
forall a b. (a -> b) -> a -> b
$ [Reg] -> [[Reg]]
forall a. [a] -> [[a]]
inits [Reg]
steps)
      -- Fall back to the beginning and come back to the present state.
      toStart :: Reg
      toStart :: Reg
toStart = Reg -> [Reg] -> Reg
joinSteps (Reg
RAny Reg -> Reg -> Reg
`RMinus` [Char] -> Reg
RAlts (Char
x Char -> [Char] -> [Char]
forall a. a -> [a] -> [a]
: ((Char, Reg) -> Char) -> [(Char, Reg)] -> [Char]
forall a b. (a -> b) -> [a] -> [b]
map (Char, Reg) -> Char
forall a b. (a, b) -> a
fst [(Char, Reg)]
possibilities)) [Reg]
steps
      -- Adding a possiblity on top of the existing ones.
      addPoss :: [(Char,Reg)] -> (Char,Bool,[Reg]) -> [(Char,Reg)]
      addPoss :: [(Char, Reg)] -> (Char, Bool, [Reg]) -> [(Char, Reg)]
addPoss
        [(Char, Reg)]
poss                -- List of possibilities (a(k),r) of falling back to a(k) and recovering to a(i) via r.
        (Char
z, Bool
cond, [Reg]
steps)    -- Investigating possibility to fall back to a(1..j) where cond says this is in principle
                            -- possible if we read @z@, not @x@, and none of the previous possibilities.
                            -- @steps@ brings us back to the current state (after falling back).
        | Bool
cond, Char
z Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` [Char]
exclude = (Char
z, Reg -> [Reg] -> Reg
joinSteps (Char -> Reg
RChar Char
z) [Reg]
steps) (Char, Reg) -> [(Char, Reg)] -> [(Char, Reg)]
forall a. a -> [a] -> [a]
: [(Char, Reg)]
poss
        | Bool
otherwise = [(Char, Reg)]
poss
        where
        -- To fall back with @z@, we need to exclude the possibility of
        -- advancing (via character @x@) and falling back less.
        exclude :: [Char]
        exclude :: [Char]
exclude = Char
x Char -> [Char] -> [Char]
forall a. a -> [a] -> [a]
: ((Char, Reg) -> Char) -> [(Char, Reg)] -> [Char]
forall a b. (a -> b) -> [a] -> [b]
map (Char, Reg) -> Char
forall a b. (a, b) -> a
fst [(Char, Reg)]
poss
      -- Conditions of whether a fallback is in principle possible,
      -- starting with the state we have been in previously, ending in the first state.
      -- If we are in state a(1..i), the possibility of falling back to a(1..j)
      -- is constrained on a(1..j-1) = a(i-j+2..i).
      conds :: [Bool]
      conds :: [Bool]
conds = ([Char] -> [Char] -> Bool) -> [[Char]] -> [[Char]] -> [Bool]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith [Char] -> [Char] -> Bool
forall a. Eq a => a -> a -> Bool
(==) ([[Char]] -> [[Char]]
forall a. [a] -> [a]
tail ([[Char]] -> [[Char]]) -> [[Char]] -> [[Char]]
forall a b. (a -> b) -> a -> b
$ [[Char]] -> [[Char]]
forall a. [a] -> [a]
reverse ([[Char]] -> [[Char]]) -> [[Char]] -> [[Char]]
forall a b. (a -> b) -> a -> b
$ [Char] -> [[Char]]
forall a. [a] -> [[a]]
inits [Char]
ys) ([[Char]] -> [[Char]]
forall a. [a] -> [a]
tail ([[Char]] -> [[Char]]) -> [[Char]] -> [[Char]]
forall a b. (a -> b) -> a -> b
$ [Char] -> [[Char]]
forall a. [a] -> [[a]]
tails [Char]
ys)