{-# LANGUAGE ScopedTypeVariables, DeriveGeneric, DeriveAnyClass, OverloadedStrings #-}
{-|
  Module      : Text.ANTLR.Lex.Tokenizer
  Description : Tokenization algorithms
  Copyright   : (c) Karl Cronburg, 2018
  License     : BSD3
  Maintainer  : karl@cs.tufts.edu
  Stability   : experimental
  Portability : POSIX
-}
module Text.ANTLR.Lex.Tokenizer where

import Text.ANTLR.Lex.Automata
import Text.ANTLR.Lex.DFA
import qualified Text.ANTLR.Set as Set
import Text.ANTLR.Set (Hashable, member, Generic(..), Set(..))
import Text.ANTLR.Pretty
import qualified Debug.Trace as D
import Data.List (find)
import qualified Data.Text as T

-- | Token with name @n@, value @v@, and the number of input symbols consumed
--   in matching it.
data Token n v =
    Token n v Int  -- ^ A successfully tokenized token
  | EOF            -- ^ The end-of-file token
  | Error T.Text   -- ^ Error encountered while tokenizing
  deriving (Show, Ord, Generic, Hashable)

instance (Prettify n, Prettify v) => Prettify (Token n v) where
  prettify EOF           = pStr "EOF"
  prettify (Error s)     = pStr "Token Error: " >> pStr s
  prettify (Token n v i) = prettify v

-- | Two 'Token's are equal when their names match; token values are ignored.
instance Eq n => Eq (Token n v) where
  Token s _ _ == Token s1 _ _ = s == s1
  EOF         == EOF          = True
  Error s     == Error s1     = s == s1
  _           == _            = False

-- | Token names are the input symbols to the parser.
tokenName :: Token n v -> n
tokenName (Token n v _) = n

-- | Get the value of a token, ignoring its name.
tokenValue :: Token n v -> v
tokenValue (Token n v _) = v

-- | Get the number of input symbols that this token matched on.
tokenSize :: Token n v -> Int
tokenSize (Token _ _ i) = i
tokenSize EOF           = 0
tokenSize (Error _)     = 0  -- A lexical error consumes no input.

-- | A Lexeme is a sequence of zero or more (matched) input symbols.
type Lexeme s = [s]

-- | A named DFA over symbols @s@, indices @i@, and names @n@.
type NDFA s i n = (n, DFA s i)
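-- | A minimal, hypothetical sketch (illustrative only, not part of any lexer
--   specification) of how the name type @n@ and value type @v@ might be
--   instantiated, together with a token-value constructor of the shape
--   @Lexeme s -> n -> v@ expected by 'tokenize' and 'tokenizeInc' below.
--   The names 'ExampleName', 'ExampleValue', and 'exampleValue' are made up
--   for illustration.
data ExampleName = T_Id | T_Num | T_WS
  deriving (Eq, Ord, Show, Generic, Hashable)

data ExampleValue = VStr String | VInt Int | VUnit
  deriving (Eq, Ord, Show, Generic, Hashable)

-- | Construct a token value from the matched lexeme and its token name,
--   assuming @T_Num@ only ever matches lexemes that parse as an 'Int'.
exampleValue :: Lexeme Char -> ExampleName -> ExampleValue
exampleValue l T_Id  = VStr l
exampleValue l T_Num = VInt (read l)
exampleValue _ T_WS  = VUnit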
-- | Entrypoint for tokenizing an input stream, given a list of named DFAs
--   that we can match on.
--
--   * @dfaTuples@: association list pairing each token name with the DFA
--     that recognizes it, as given in the lexer specification.
--
--   * @fncn@: function for constructing the value of a token from the lexeme
--     matched (e.g. @varName@) and the associated token name (e.g. @id@).
tokenize
  :: forall s i n v. (Eq i, Ord s, Eq s, Show s, Show i, Show n, Show v, Hashable i, Hashable s)
  => [(n, DFA s i)]       -- ^ Association list of named DFAs.
  -> (Lexeme s -> n -> v) -- ^ Constructs the value of a token from the lexeme matched.
  -> [s]                  -- ^ The input string.
  -> [Token n v]          -- ^ The resulting token stream.
tokenize dfaTuples fncn input0 = let

    dfas0 = map snd dfaTuples

    -- Tokenize the entire remaining input, one token at a time, restarting
    -- every DFA from its start state after each token.
    allTok :: [(NDFA s i n, State i)] -> [s] -> [Token n v]
    allTok dfaSims0 currInput = let

        -- Run all DFAs in lockstep over the input, returning the longest
        -- lexeme matched together with (the first of) the DFAs accepting it.
        oneTok :: [(NDFA s i n, State i)] -> [s] -> Maybe (Lexeme s, NDFA s i n)
        oneTok dfaSims []     = Nothing
        oneTok []      ss     = Nothing
        oneTok dfaSims (s:ss) = let
            -- Advance every DFA whose cursor has an outgoing edge on @s@.
            dfaSims' =
              [ ((n, dfa), stop)
              | ((n, dfa), cursor) <- dfaSims
              , (start, es, stop)  <- Set.toList $ _Δ dfa
              , start == cursor && s `edgeMember` es
              ]
            -- DFAs left in an accepting state after consuming @s@.
            accepting = [ (n, dfa) | ((n, dfa), cursor) <- dfaSims', cursor `member` _F dfa ]
          in (case (oneTok dfaSims' ss, accepting) of
              (Nothing, [])     -> Nothing          -- Nothing matches from here on.
              (Nothing, d:ds)   -> Just ([s], d)    -- Longest match ends at @s@.
              (Just (l,d), _)   -> Just (s:l, d))   -- A longer match exists; prefer it.

      in case (currInput, oneTok dfaSims0 currInput) of
        ([], _)       -> [EOF]
        (ss, Nothing) -> [Error $ T.pack $ show ss]
        (ss, Just (l, (name, d))) ->
          Token name (fncn l name) (length l) : allTok dfaSims0 (drop (length l) currInput)

  in allTok (zip dfaTuples (map s0 dfas0)) input0

-- | Incremental tokenizer. Takes the same list of named DFAs and token-value
--   constructor as 'tokenize', but instead returns an incremental tokenizer
--   closure. That closure expects the set of token names we currently expect
--   to tokenize on and the current input stream, and returns a single token
--   along with the remaining input stream to feed back into the closure on
--   the next call.
tokenizeInc
  :: forall s i n v. (Eq i, Ord s, Eq n, Eq s, Show s, Show i, Show n, Show v, Hashable i, Hashable s, Hashable n)
  => (n -> Bool)           -- ^ Returns True on DFA names we wish to filter __out__ of the results.
  -> [(n, DFA s i)]        -- ^ Closure over association list of named DFAs.
  -> (Lexeme s -> n -> v)  -- ^ Token value constructor from lexemes.
  -> (Set n -> [s] -> (Token n v, [s]))  -- ^ The incremental tokenizer closure.
tokenizeInc filterF dfaTuples fncn = let

    tI :: Set n -> [s] -> (Token n v, [s])
    tI ns input = let
        -- Run only the DFAs we currently expect, plus the filtered-out ones
        -- (e.g. whitespace) so that they can be skipped over.
        dfaTuples' = filter (\(n,_) -> n `Set.member` ns || filterF n) dfaTuples
        tokenized  = tokenize dfaTuples' fncn input

        -- Tokens whose names the caller asked us to filter out.
        filterF' (Token n _ _) = filterF n
        filterF' _             = False

        ignored    = takeWhile filterF' tokenized
        nextTokens = dropWhile filterF' tokenized -- Yayy lazy function evaluation.
        next       = case nextTokens of
                       []    -> EOF
                       (t:_) -> t --D.traceShowId t
      in (next, drop (sum $ map tokenSize $ next : ignored) input)

  in tI
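-- | A minimal sketch (illustrative only, not part of the lexer API) of how a
--   parser driver might repeatedly invoke the closure returned by
--   'tokenizeInc' until the input is exhausted. Here @tokInc@ is that closure
--   and @expected@ is the set of token names the parser currently expects;
--   both are assumed to be supplied by the caller. A real driver would
--   typically recompute @expected@ from the parser state before each call
--   rather than holding it fixed as done here.
exampleDriveInc
  :: (Set n -> [s] -> (Token n v, [s]))  -- ^ Closure obtained from 'tokenizeInc'.
  -> Set n                               -- ^ Token names currently expected.
  -> [s]                                 -- ^ Input stream.
  -> [Token n v]
exampleDriveInc tokInc expected = go
  where
    go input = case tokInc expected input of
      (EOF, _)         -> [EOF]        -- End of input: stop.
      (t@(Error _), _) -> [t]          -- Lexical error: stop and report it.
      (t, rest)        -> t : go rest  -- Keep the token and continue on the rest.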