-- -----------------------------------------------------------------------------
-- 
-- DFA.hs, part of Alex
--
-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
--
-- This module generates a DFA from a scanner by first converting it
-- to an NFA and then converting the NFA with the subset construction.
-- 
-- See the chapter on `Finite Automata and Lexical Analysis' in the
-- dragon book for an excellent overview of the algorithms in this
-- module.
--
-- ----------------------------------------------------------------------------}

module DFA(scanner2dfa) where

import AbsSyn
import qualified Map
import qualified Data.IntMap as IntMap
import NFA
import Sort ( msort, nub' )
import CharSet

import Data.Array ( (!) )
import Data.Maybe ( fromJust )

{-                        Defined in the Scan Module

-- (This section should logically belong to the DFA module but it has been
-- placed here to make this module self-contained.)
--  
-- `DFA' provides an alternative to `Scanner' (described in the RExp module);
-- it can be used directly to scan text efficiently.  Additionally it has an
-- extra place holder for holding action functions for generating
-- application-specific tokens.  When this place holder is not being used, the
-- unit type will be used.
--  
-- Each state in the automaton consist of a list of `Accept' values, descending
-- in priority, and an array mapping characters to new states.  As the array
-- may only cover a sub-range of the characters, a default state number is
-- given in the third field.  By convention, all transitions to the -1 state
-- represent invalid transitions.
--  
-- A list of accept states is provided for as the original specification may
-- have been ambiguous, in which case the highest priority token should be
-- taken (the one appearing earliest in the specification); this can not be
-- calculated when the DFA is generated in all cases as some of the tokens may
-- be associated with leading or trailing context or start codes.
--  
-- `scan_token' (see above) can deal with unconditional accept states more
-- efficiently than those associated with context; to save it testing each time
-- whether the list of accept states contains an unconditional state, the flag
-- in the first field of `St' is set to true whenever the list contains an
-- unconditional state.
--  
-- The `Accept' structure contains the priority of the token being accepted
-- (lower numbers => higher priorities), the name of the token, a place holder
-- that can be used for storing the `action' function for constructing the
-- token from the input text and thge scanner's state, a list of start codes
-- (listing the start codes that the scanner must be in for the token to be
-- accepted; empty => no restriction), the leading and trailing context (both
-- `Nothing' if there is none).
--  
-- The leading context consists simply of a character predicate that will
-- return true if the last character read is acceptable.  The trailing context
-- consists of an alternative starting state within the DFA; if this `sub-dfa'
-- turns up any accepting state when applied to the residual input then the
-- trailing context is acceptable (see `scan_token' above).

type DFA a = Array SNum (State a)

type SNum = Int

data State a = St Bool [Accept a] SNum (Array Char SNum)

data Accept a = Acc Int String a [StartCode] (MB(Char->Bool)) (MB SNum)

type StartCode = Int
-}


-- Scanners are converted to DFAs by converting them to NFAs first.  Converting
-- an NFA to a DFA works by identifying the states of the DFA with subsets of
-- the NFA.  The PartDFA is used to construct the DFA; it is essentially a DFA
-- in which the states are represented directly by state sets of the NFA.
-- `nfa2pdfa' constructs the partial DFA from the NFA by searching for all the
-- transitions from a given list of state sets, initially containing the start
-- state of the partial DFA, until all possible state sets have been considered
-- The final DFA is then constructed with a `mk_dfa'.

scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
scanner2dfa :: Encoding -> Scanner -> [SNum] -> DFA SNum Code
scanner2dfa Encoding
enc Scanner
scanner [SNum]
scs = [SNum] -> NFA -> DFA SNum Code
nfa2dfa [SNum]
scs (Encoding -> Scanner -> [SNum] -> NFA
scanner2nfa Encoding
enc Scanner
scanner [SNum]
scs)

nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
nfa2dfa :: [SNum] -> NFA -> DFA SNum Code
nfa2dfa [SNum]
scs NFA
nfa = NFA -> DFA [SNum] Code -> DFA SNum Code
forall a. NFA -> DFA [SNum] a -> DFA SNum a
mk_int_dfa NFA
nfa (NFA -> DFA [SNum] Code -> [[SNum]] -> DFA [SNum] Code
nfa2pdfa NFA
nfa DFA [SNum] Code
forall {a}. DFA [SNum] a
pdfa (DFA [SNum] Any -> [[SNum]]
forall s a. DFA s a -> [s]
dfa_start_states DFA [SNum] Any
forall {a}. DFA [SNum] a
pdfa))
        where
        pdfa :: DFA [SNum] a
pdfa = SNum -> NFA -> DFA [SNum] a
forall a. SNum -> NFA -> DFA [SNum] a
new_pdfa SNum
n_starts NFA
nfa
        n_starts :: SNum
n_starts = [SNum] -> SNum
forall (t :: * -> *) a. Foldable t => t a -> SNum
length [SNum]
scs  -- number of start states

-- `nfa2pdfa' works by taking the next outstanding state set to be considered
-- and and ignoring it if the state is already in the partial DFA, otherwise
-- generating all possible transitions from it, adding the new state to the
-- partial DFA and continuing the closure with the extra states.  Note the way
-- it incorporates the trailing context references into the search (by
-- including `rctx_ss' in the search).

nfa2pdfa:: NFA -> DFA StateSet Code -> [StateSet] -> DFA StateSet Code
nfa2pdfa :: NFA -> DFA [SNum] Code -> [[SNum]] -> DFA [SNum] Code
nfa2pdfa NFA
_   DFA [SNum] Code
pdfa [] = DFA [SNum] Code
pdfa
nfa2pdfa NFA
nfa DFA [SNum] Code
pdfa ([SNum]
ss:[[SNum]]
umkd)
  |  [SNum]
ss [SNum] -> DFA [SNum] Code -> Bool
forall a. [SNum] -> DFA [SNum] a -> Bool
`in_pdfa` DFA [SNum] Code
pdfa =  NFA -> DFA [SNum] Code -> [[SNum]] -> DFA [SNum] Code
nfa2pdfa NFA
nfa DFA [SNum] Code
pdfa  [[SNum]]
umkd
  |  Bool
otherwise         =  NFA -> DFA [SNum] Code -> [[SNum]] -> DFA [SNum] Code
nfa2pdfa NFA
nfa DFA [SNum] Code
pdfa' [[SNum]]
umkd'
  where
        pdfa' :: DFA [SNum] Code
pdfa' = [SNum] -> State [SNum] Code -> DFA [SNum] Code -> DFA [SNum] Code
forall a. [SNum] -> State [SNum] a -> DFA [SNum] a -> DFA [SNum] a
add_pdfa [SNum]
ss ([Accept Code] -> IntMap [SNum] -> State [SNum] Code
forall s a. [Accept a] -> IntMap s -> State s a
State [Accept Code]
accs ([(SNum, [SNum])] -> IntMap [SNum]
forall a. [(SNum, a)] -> IntMap a
IntMap.fromList [(SNum, [SNum])]
ss_outs)) DFA [SNum] Code
pdfa

        umkd' :: [[SNum]]
umkd' = [[SNum]]
rctx_sss [[SNum]] -> [[SNum]] -> [[SNum]]
forall a. [a] -> [a] -> [a]
++ ((SNum, [SNum]) -> [SNum]) -> [(SNum, [SNum])] -> [[SNum]]
forall a b. (a -> b) -> [a] -> [b]
map (SNum, [SNum]) -> [SNum]
forall a b. (a, b) -> b
snd [(SNum, [SNum])]
ss_outs [[SNum]] -> [[SNum]] -> [[SNum]]
forall a. [a] -> [a] -> [a]
++ [[SNum]]
umkd

        -- for each character, the set of states that character would take
        -- us to from the current set of states in the NFA.
        ss_outs :: [(Int, StateSet)]
        ss_outs :: [(SNum, [SNum])]
ss_outs = [ (Byte -> SNum
forall a b. (Integral a, Num b) => a -> b
fromIntegral Byte
ch, NFA -> [SNum] -> [SNum]
mk_ss NFA
nfa [SNum]
ss')
                  | Byte
ch  <- ByteSet -> [Byte]
byteSetElems (ByteSet -> [Byte]) -> ByteSet -> [Byte]
forall a b. (a -> b) -> a -> b
$ [ByteSet] -> ByteSet
forall a. DiscreteOrdered a => [RSet a] -> RSet a
setUnions [ByteSet
p | (ByteSet
p,SNum
_) <- [(ByteSet, SNum)]
outs],
                    let ss' :: [SNum]
ss'  = [ SNum
s' | (ByteSet
p,SNum
s') <- [(ByteSet, SNum)]
outs, ByteSet -> Byte -> Bool
byteSetElem ByteSet
p Byte
ch ],
                    Bool -> Bool
not ([SNum] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [SNum]
ss')
                  ]

        rctx_sss :: [[SNum]]
rctx_sss = [ NFA -> [SNum] -> [SNum]
mk_ss NFA
nfa [SNum
s]
                   | Acc SNum
_ Maybe Code
_ Maybe CharSet
_ (RightContextRExp SNum
s) <- [Accept Code]
accs ]

        outs :: [(ByteSet,SNum)]
        outs :: [(ByteSet, SNum)]
outs =  [ (ByteSet, SNum)
out | SNum
s <- [SNum]
ss, (ByteSet, SNum)
out <- NState -> [(ByteSet, SNum)]
nst_outs (NFA
nfaNFA -> SNum -> NState
forall i e. Ix i => Array i e -> i -> e
!SNum
s) ]

        accs :: [Accept Code]
accs = [Accept Code] -> [Accept Code]
forall a. [Accept a] -> [Accept a]
sort_accs [Accept Code
acc| SNum
s<-[SNum]
ss, Accept Code
acc<-NState -> [Accept Code]
nst_accs (NFA
nfaNFA -> SNum -> NState
forall i e. Ix i => Array i e -> i -> e
!SNum
s)]

-- `sort_accs' sorts a list of accept values into decending order of priority,
-- eliminating any elements that follow an unconditional accept value.

sort_accs:: [Accept a] -> [Accept a]
sort_accs :: forall a. [Accept a] -> [Accept a]
sort_accs [Accept a]
accs = (Accept a -> [Accept a] -> [Accept a])
-> [Accept a] -> [Accept a] -> [Accept a]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Accept a -> [Accept a] -> [Accept a]
forall {a}. Accept a -> [Accept a] -> [Accept a]
chk [] ((Accept a -> Accept a -> Bool) -> [Accept a] -> [Accept a]
forall a. (a -> a -> Bool) -> [a] -> [a]
msort Accept a -> Accept a -> Bool
forall {a} {a}. Accept a -> Accept a -> Bool
le [Accept a]
accs)
        where
        chk :: Accept a -> [Accept a] -> [Accept a]
chk acc :: Accept a
acc@(Acc SNum
_ Maybe a
_ Maybe CharSet
Nothing RightContext SNum
NoRightContext) [Accept a]
_   = [Accept a
acc]
        chk Accept a
acc                                  [Accept a]
rst = Accept a
accAccept a -> [Accept a] -> [Accept a]
forall a. a -> [a] -> [a]
:[Accept a]
rst

        le :: Accept a -> Accept a -> Bool
le (Acc{accPrio :: forall a. Accept a -> SNum
accPrio = SNum
n}) (Acc{accPrio :: forall a. Accept a -> SNum
accPrio=SNum
n'}) = SNum
nSNum -> SNum -> Bool
forall a. Ord a => a -> a -> Bool
<=SNum
n'



{------------------------------------------------------------------------------
                          State Sets and Partial DFAs
------------------------------------------------------------------------------}



-- A `PartDFA' is a partially constructed DFA in which the states are
-- represented by sets of states of the original NFA.  It is represented by a
-- triple consisting of the start state of the partial DFA, the NFA from which
-- it is derived and a map from state sets to states of the partial DFA.  The
-- state set for a given list of NFA states is calculated by taking the epsilon
-- closure of all the states, sorting the result with duplicates eliminated.

type StateSet = [SNum]

new_pdfa:: Int -> NFA -> DFA StateSet a
new_pdfa :: forall a. SNum -> NFA -> DFA [SNum] a
new_pdfa SNum
starts NFA
nfa
 = DFA :: forall s a. [s] -> Map s (State s a) -> DFA s a
DFA { dfa_start_states :: [[SNum]]
dfa_start_states = [[SNum]]
start_ss,
         dfa_states :: Map [SNum] (State [SNum] a)
dfa_states = Map [SNum] (State [SNum] a)
forall k a. Map k a
Map.empty
       }
 where
        start_ss :: [[SNum]]
start_ss = [ (SNum -> SNum -> Bool) -> [SNum] -> [SNum]
forall a. (a -> a -> Bool) -> [a] -> [a]
msort SNum -> SNum -> Bool
forall a. Ord a => a -> a -> Bool
(<=) (NState -> [SNum]
nst_cl(NFA
nfaNFA -> SNum -> NState
forall i e. Ix i => Array i e -> i -> e
!SNum
n)) | SNum
n <- [SNum
0..(SNum
startsSNum -> SNum -> SNum
forall a. Num a => a -> a -> a
-SNum
1)]]

 -- starts is the number of start states

-- constructs the epsilon-closure of a set of NFA states
mk_ss:: NFA -> [SNum] -> StateSet
mk_ss :: NFA -> [SNum] -> [SNum]
mk_ss NFA
nfa [SNum]
l = (SNum -> SNum -> Bool) -> [SNum] -> [SNum]
forall a. (a -> a -> Bool) -> [a] -> [a]
nub' SNum -> SNum -> Bool
forall a. Ord a => a -> a -> Bool
(<=) [SNum
s'| SNum
s<-[SNum]
l, SNum
s'<-NState -> [SNum]
nst_cl(NFA
nfaNFA -> SNum -> NState
forall i e. Ix i => Array i e -> i -> e
!SNum
s)]

add_pdfa:: StateSet -> State StateSet a -> DFA StateSet a -> DFA StateSet a
add_pdfa :: forall a. [SNum] -> State [SNum] a -> DFA [SNum] a -> DFA [SNum] a
add_pdfa [SNum]
ss State [SNum] a
pst (DFA [[SNum]]
st Map [SNum] (State [SNum] a)
mp) = [[SNum]] -> Map [SNum] (State [SNum] a) -> DFA [SNum] a
forall s a. [s] -> Map s (State s a) -> DFA s a
DFA [[SNum]]
st ([SNum]
-> State [SNum] a
-> Map [SNum] (State [SNum] a)
-> Map [SNum] (State [SNum] a)
forall k a. Ord k => k -> a -> Map k a -> Map k a
Map.insert [SNum]
ss State [SNum] a
pst Map [SNum] (State [SNum] a)
mp)

in_pdfa:: StateSet -> DFA StateSet a -> Bool
in_pdfa :: forall a. [SNum] -> DFA [SNum] a -> Bool
in_pdfa [SNum]
ss (DFA [[SNum]]
_ Map [SNum] (State [SNum] a)
mp) = [SNum]
ss [SNum] -> Map [SNum] (State [SNum] a) -> Bool
forall k a. Ord k => k -> Map k a -> Bool
`Map.member` Map [SNum] (State [SNum] a)
mp

-- Construct a DFA with numbered states, from a DFA whose states are
-- sets of states from the original NFA.

mk_int_dfa:: NFA -> DFA StateSet a -> DFA SNum a
mk_int_dfa :: forall a. NFA -> DFA [SNum] a -> DFA SNum a
mk_int_dfa NFA
nfa (DFA [[SNum]]
start_states Map [SNum] (State [SNum] a)
mp)
  = [SNum] -> Map SNum (State SNum a) -> DFA SNum a
forall s a. [s] -> Map s (State s a) -> DFA s a
DFA [SNum
0 .. [[SNum]] -> SNum
forall (t :: * -> *) a. Foldable t => t a -> SNum
length [[SNum]]
start_statesSNum -> SNum -> SNum
forall a. Num a => a -> a -> a
-SNum
1] 
        ([(SNum, State SNum a)] -> Map SNum (State SNum a)
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList [ ([SNum] -> SNum
lookup' [SNum]
st, State [SNum] a -> State SNum a
forall a. State [SNum] a -> State SNum a
cnv State [SNum] a
pds) | ([SNum]
st, State [SNum] a
pds) <- Map [SNum] (State [SNum] a) -> [([SNum], State [SNum] a)]
forall k a. Map k a -> [(k, a)]
Map.toAscList Map [SNum] (State [SNum] a)
mp ])
  where
        mp' :: Map [SNum] SNum
mp' = [([SNum], SNum)] -> Map [SNum] SNum
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([[SNum]] -> [SNum] -> [([SNum], SNum)]
forall a b. [a] -> [b] -> [(a, b)]
zip ([[SNum]]
start_states [[SNum]] -> [[SNum]] -> [[SNum]]
forall a. [a] -> [a] -> [a]
++ 
                                 ((([SNum], State [SNum] a) -> [SNum])
-> [([SNum], State [SNum] a)] -> [[SNum]]
forall a b. (a -> b) -> [a] -> [b]
map ([SNum], State [SNum] a) -> [SNum]
forall a b. (a, b) -> a
fst ([([SNum], State [SNum] a)] -> [[SNum]])
-> (Map [SNum] (State [SNum] a) -> [([SNum], State [SNum] a)])
-> Map [SNum] (State [SNum] a)
-> [[SNum]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Map [SNum] (State [SNum] a) -> [([SNum], State [SNum] a)]
forall k a. Map k a -> [(k, a)]
Map.toAscList) (([SNum]
 -> Map [SNum] (State [SNum] a) -> Map [SNum] (State [SNum] a))
-> Map [SNum] (State [SNum] a)
-> [[SNum]]
-> Map [SNum] (State [SNum] a)
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr [SNum]
-> Map [SNum] (State [SNum] a) -> Map [SNum] (State [SNum] a)
forall k a. Ord k => k -> Map k a -> Map k a
Map.delete Map [SNum] (State [SNum] a)
mp [[SNum]]
start_states)) [SNum
0..])

        lookup' :: [SNum] -> SNum
lookup' = Maybe SNum -> SNum
forall a. HasCallStack => Maybe a -> a
fromJust (Maybe SNum -> SNum) -> ([SNum] -> Maybe SNum) -> [SNum] -> SNum
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([SNum] -> Map [SNum] SNum -> Maybe SNum)
-> Map [SNum] SNum -> [SNum] -> Maybe SNum
forall a b c. (a -> b -> c) -> b -> a -> c
flip [SNum] -> Map [SNum] SNum -> Maybe SNum
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Map [SNum] SNum
mp'

        cnv :: State StateSet a -> State SNum a
        cnv :: forall a. State [SNum] a -> State SNum a
cnv (State [Accept a]
accs IntMap [SNum]
as) = [Accept a] -> IntMap SNum -> State SNum a
forall s a. [Accept a] -> IntMap s -> State s a
State [Accept a]
accs' IntMap SNum
as'
                where
                as' :: IntMap SNum
as'   = (SNum -> [SNum] -> SNum) -> IntMap [SNum] -> IntMap SNum
forall a b. (SNum -> a -> b) -> IntMap a -> IntMap b
IntMap.mapWithKey (\SNum
_ch [SNum]
s -> [SNum] -> SNum
lookup' [SNum]
s) IntMap [SNum]
as

                accs' :: [Accept a]
accs' = (Accept a -> Accept a) -> [Accept a] -> [Accept a]
forall a b. (a -> b) -> [a] -> [b]
map Accept a -> Accept a
forall {a}. Accept a -> Accept a
cnv_acc [Accept a]
accs
                cnv_acc :: Accept a -> Accept a
cnv_acc (Acc SNum
p Maybe a
a Maybe CharSet
lctx RightContext SNum
rctx) = SNum -> Maybe a -> Maybe CharSet -> RightContext SNum -> Accept a
forall a.
SNum -> Maybe a -> Maybe CharSet -> RightContext SNum -> Accept a
Acc SNum
p Maybe a
a Maybe CharSet
lctx RightContext SNum
rctx'
                  where rctx' :: RightContext SNum
rctx' = 
                          case RightContext SNum
rctx of
                                RightContextRExp SNum
s -> 
                                  SNum -> RightContext SNum
forall r. r -> RightContext r
RightContextRExp ([SNum] -> SNum
lookup' (NFA -> [SNum] -> [SNum]
mk_ss NFA
nfa [SNum
s]))
                                RightContext SNum
other -> RightContext SNum
other

{-

-- `mk_st' constructs a state node from the list of accept values and a list of
-- transitions.  The transitions list all the valid transitions out of the
-- node; all invalid transitions should be represented in the array by state
-- -1.  `mk_st' has to work out whether the accept states contain an
-- unconditional entry, in which case the first field of `St' should be true,
-- and which default state to use in constructing the array (the array may span
-- a sub-range of the character set, the state number given the third argument
-- of `St' being taken as the default if an input character lies outside the
-- range).  The default values is chosen to minimise the bounds of the array
-- and so there are two candidates: the value that 0 maps to (in which case
-- some initial segment of the array may be omitted) or the value that 255 maps
-- to (in which case a final segment of the array may be omitted), hence the
-- calculation of `(df,bds)'.
--  
-- Note that empty arrays are avoided as they can cause severe problems for
-- some popular Haskell compilers.

mk_st:: [Accept Code] -> [(Char,Int)] -> State Code
mk_st accs as =
        if null as
           then St accs (-1) (listArray ('0','0') [-1])
           else St accs df (listArray bds [arr!c| c<-range bds])
        where
        bds = if sz==0 then ('0','0') else bds0

        (sz,df,bds0) | sz1 < sz2 = (sz1,df1,bds1)
                     | otherwise = (sz2,df2,bds2)

        (sz1,df1,bds1) = mk_bds(arr!chr 0)
        (sz2,df2,bds2) = mk_bds(arr!chr 255)

        mk_bds df = (t-b, df, (chr b, chr (255-t)))
                where
                b = length (takeWhile id [arr!c==df| c<-['\0'..'\xff']])
                t = length (takeWhile id [arr!c==df| c<-['\xff','\xfe'..'\0']])

        arr = listArray ('\0','\xff') (take 256 (repeat (-1))) // as
-}