-- -----------------------------------------------------------------------------
--
-- DFA.hs, part of Alex
--
-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
--
-- This module generates a DFA from a scanner by first converting it
-- to an NFA and then converting the NFA with the subset construction.
--
-- See the chapter on `Finite Automata and Lexical Analysis' in the
-- dragon book for an excellent overview of the algorithms in this
-- module.
--
-- ----------------------------------------------------------------------------}

module DFA (scanner2dfa) where

import Data.Array    ( (!) )
import Data.Function ( on )
import Data.Maybe    ( fromJust )

import qualified Data.IntMap as IntMap
import qualified Data.IntSet as IntSet
import qualified Data.Map    as Map
import qualified Data.List   as List

import AbsSyn
import NFA
import CharSet

{-                        Defined in the Scan Module

-- (This section should logically belong to the DFA module but it has been
-- placed here to make this module self-contained.)
--
-- `DFA' provides an alternative to `Scanner' (described in the RExp module);
-- it can be used directly to scan text efficiently.  Additionally it has an
-- extra place holder for holding action functions for generating
-- application-specific tokens.  When this place holder is not being used, the
-- unit type will be used.
--
-- Each state in the automaton consist of a list of `Accept' values, descending
-- in priority, and an array mapping characters to new states.  As the array
-- may only cover a sub-range of the characters, a default state number is
-- given in the third field.  By convention, all transitions to the -1 state
-- represent invalid transitions.
--
-- A list of accept states is provided for as the original specification may
-- have been ambiguous, in which case the highest priority token should be
-- taken (the one appearing earliest in the specification); this can not be
-- calculated when the DFA is generated in all cases as some of the tokens may
-- be associated with leading or trailing context or start codes.
--
-- `scan_token' (see above) can deal with unconditional accept states more
-- efficiently than those associated with context; to save it testing each time
-- whether the list of accept states contains an unconditional state, the flag
-- in the first field of `St' is set to true whenever the list contains an
-- unconditional state.
--
-- The `Accept' structure contains the priority of the token being accepted
-- (lower numbers => higher priorities), the name of the token, a place holder
-- that can be used for storing the `action' function for constructing the
-- token from the input text and the scanner's state, a list of start codes
-- (listing the start codes that the scanner must be in for the token to be
-- accepted; empty => no restriction), the leading and trailing context (both
-- `Nothing' if there is none).
--
-- The leading context consists simply of a character predicate that will
-- return true if the last character read is acceptable.  The trailing context
-- consists of an alternative starting state within the DFA; if this `sub-dfa'
-- turns up any accepting state when applied to the residual input then the
-- trailing context is acceptable (see `scan_token' above).

type DFA a = Array SNum (State a)

type SNum = Int

data State a = St Bool [Accept a] SNum (Array Char SNum)

data Accept a = Acc Int String a [StartCode] (MB(Char->Bool)) (MB SNum)

type StartCode = Int
-}


-- Scanners are converted to DFAs by converting them to NFAs first.  Converting
-- an NFA to a DFA works by identifying the states of the DFA with subsets of
-- the NFA.  The PartDFA is used to construct the DFA; it is essentially a DFA
-- in which the states are represented directly by state sets of the NFA.
-- `nfa2pdfa' constructs the partial DFA from the NFA by searching for all the
-- transitions from a given list of state sets, initially containing the start
-- state of the partial DFA, until all possible state sets have been considered
-- The final DFA is then constructed with a `mk_dfa'.

scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)

nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))
        where
        pdfa = new_pdfa n_starts nfa
        n_starts = length scs  -- number of start states

-- `nfa2pdfa' works by taking the next outstanding state set to be considered
-- and ignoring it if the state is already in the partial DFA, otherwise
-- generating all possible transitions from it, adding the new state to the
-- partial DFA and continuing the closure with the extra states.  Note the way
-- it incorporates the trailing context references into the search (by
-- including `rctx_ss' in the search).

nfa2pdfa:: NFA -> DFA StateSet Code -> [StateSet] -> DFA StateSet Code
nfa2pdfa _   pdfa [] = pdfa
nfa2pdfa nfa pdfa (ss:umkd)
  |  ss `in_pdfa` pdfa =  nfa2pdfa nfa pdfa  umkd
  |  otherwise         =  nfa2pdfa nfa pdfa' umkd'
  where
        pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa

        umkd' = rctx_sss ++ map snd ss_outs ++ umkd

        -- for each character, the set of states that character would take
        -- us to from the current set of states in the NFA.
        ss_outs :: [(Int, StateSet)]
        ss_outs = [ (fromIntegral ch, mk_ss nfa ss')
                  | ch  <- byteSetElems $ setUnions [p | (p,_) <- outs],
                    let ss'  = [ s' | (p,s') <- outs, byteSetElem p ch ],
                    not (null ss')
                  ]

        rctx_sss = [ mk_ss nfa [s]
                   | Acc _ _ _ (RightContextRExp s) <- accs ]

        outs :: [(ByteSet,SNum)]
        outs =  [ out | s <- ss, out <- nst_outs (nfa ! s) ]

        accs = sort_accs [ acc | s <- ss, acc <- nst_accs (nfa ! s) ]

-- `sort_accs' sorts a list of accept values into descending order of priority,
-- eliminating any elements that follow an unconditional accept value.

sort_accs :: [Accept a] -> [Accept a]
sort_accs accs = foldr chk [] $ List.sortBy (compare `on` accPrio) accs
        where
        chk acc@(Acc _ _ Nothing NoRightContext) _   = [acc]
        chk acc                                  rst = acc:rst


{------------------------------------------------------------------------------
                          State Sets and Partial DFAs
------------------------------------------------------------------------------}


-- A `PartDFA' is a partially constructed DFA in which the states are
-- represented by sets of states of the original NFA.  It is represented by a
-- triple consisting of the start state of the partial DFA, the NFA from which
-- it is derived and a map from state sets to states of the partial DFA.  The
-- state set for a given list of NFA states is calculated by taking the epsilon
-- closure of all the states, sorting the result with duplicates eliminated.

type StateSet = [SNum]

new_pdfa :: Int -> NFA -> DFA StateSet a
new_pdfa starts nfa
 = DFA { dfa_start_states = [ List.sort $ nst_cl $ nfa ! n | n <- [0 .. starts - 1] ]
       , dfa_states       = Map.empty
       }

 -- starts is the number of start states

-- constructs the epsilon-closure of a set of NFA states
mk_ss :: NFA -> [SNum] -> StateSet
mk_ss nfa l = IntSet.toAscList $ IntSet.fromList [ s' | s <- l, s' <- nst_cl (nfa ! s) ]

add_pdfa:: StateSet -> State StateSet a -> DFA StateSet a -> DFA StateSet a
add_pdfa ss pst (DFA st mp) = DFA st (Map.insert ss pst mp)

in_pdfa:: StateSet -> DFA StateSet a -> Bool
in_pdfa ss (DFA _ mp) = ss `Map.member` mp

-- Construct a DFA with numbered states, from a DFA whose states are
-- sets of states from the original NFA.

mk_int_dfa:: NFA -> DFA StateSet a -> DFA SNum a
mk_int_dfa nfa (DFA start_states mp)
  = DFA [0 .. length start_states-1]
        (Map.fromList [ (lookup' st, cnv pds) | (st, pds) <- Map.toAscList mp ])
  where
        mp' = Map.fromList (zip (start_states ++
                                 (map fst . Map.toAscList) (foldr Map.delete mp start_states)) [0..])

        lookup' = fromJust . flip Map.lookup mp'

        cnv :: State StateSet a -> State SNum a
        cnv (State accs as) = State accs' as'
                where
                as'   = IntMap.mapWithKey (\_ch s -> lookup' s) as

                accs' = map cnv_acc accs
                cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'
                  where rctx' =
                          case rctx of
                                RightContextRExp s ->
                                  RightContextRExp (lookup' (mk_ss nfa [s]))
                                other -> other