----------------------------------------------------------------------------
-- |
-- Module      :  FastTags.LexerM
-- Copyright   :  (c) Sergey Vinokurov 2019
--
-- All the types and functions needed to make lexer run:
-- - 'AlexInput' - primary workhorse, an optimized representation of input
--   stream as a pointer to utf8 bytes and our position within it.
-- - Lexer monad 'AlexM' - a monad (self-explanatory) with state that describes
--   current lexing context.
-- - 'AlexState' - state of the lexing monad, maintains current Alex code,
--   comment depth, quasiquoter depth, indentation size, whether we're in
--   a literate mode (and in which one) or vanilla mode and whether there
--   are any TH quasiquotes present till the end of file.
--
-- All the functions are to do with
----------------------------------------------------------------------------

{-# LANGUAGE BangPatterns               #-}
{-# LANGUAGE ConstraintKinds            #-}
{-# LANGUAGE DeriveFunctor              #-}
{-# LANGUAGE FlexibleContexts           #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE LambdaCase                 #-}
{-# LANGUAGE MagicHash                  #-}
{-# LANGUAGE MultiWayIf                 #-}
{-# LANGUAGE NamedFieldPuns             #-}
{-# LANGUAGE OverloadedStrings          #-}
{-# LANGUAGE RankNTypes                 #-}
{-# LANGUAGE ScopedTypeVariables        #-}
{-# LANGUAGE UnboxedTuples              #-}

module FastTags.LexerM
    ( AlexState(..)
    , mkAlexState
    , alexEnterBirdLiterateEnv
    , alexEnterLiterateLatexEnv
    , alexExitLiterateEnv
    , pushContext
    , modifyCommentDepth
    , modifyQuasiquoterDepth
    , modifyPreprocessorDepth
    , addIndentationSize
    , checkQuasiQuoteEndPresent

    , AlexM
    , runAlexM
    , alexSetInput
    , alexSetNextCode

    , AlexInput(..)
    , aiLineL
    , takeText
    , countInputSpace
    , extractDefineOrLetName
    , dropUntilNL
    , dropUntilUnescapedNL
    , dropUntilNLOr
    , dropUntilNLOrEither
    , unsafeTextHeadAscii
    , unsafeTextHeadOfTailAscii
    , unsafeTextHead
    , utf8BS

    , asCodeL
    , asCommentDepthL
    , asQuasiquoterDepthL
    , asIndentationSizeL
    , asPreprocessorDepthL
    , asLiterateLocL
    , asHaveQQEndL

      -- * Alex interface
    , alexInputPrevChar
    , alexGetByte
    ) where

import Control.Applicative as A
import Control.DeepSeq
import Control.Exception
import Control.Monad.ST
import Control.Monad.State.Strict
import Control.Monad.Writer.Strict

import Data.Char
import Data.Int
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import Data.Void (Void, vacuous)

import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Internal as BSI
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as UM

import Foreign.ForeignPtr
import Foreign.Ptr
import GHC.Base
import GHC.Ptr
import GHC.Word
import Text.Printf

import FastTags.LensBlaze
import FastTags.LexerTypes
import FastTags.Token

import GHC.Exts (word8ToWord#, wordToWord8#)


data AlexState = AlexState
    { asInput        :: {-# UNPACK #-} !AlexInput
    , asIntStore     :: {-# UNPACK #-} !Word64
        -- ^ Integer field that stores all the other useful fields for lexing.
    , asContextStack :: [Context]
    } deriving (Show, Eq, Ord)


checkQuasiQuoteEndPresent :: Ptr Word8 -> Bool
checkQuasiQuoteEndPresent
    = (\x -> isTrue# (qqessPresent x))
    . utf8Foldl' combine (QQEndsState 0# '\n'#)
    where
    combine :: QQEndsState -> Char# -> QQEndsState
    combine QQEndsState{qqessPresent, qqessPrevChar} c# = QQEndsState
        { qqessPresent      =
          qqessPresent `orI#`
          case (# qqessPrevChar, c# #) of
              (# '|'#, ']'# #) -> 1#
              (# _,    '⟧'# #) -> 1#
              _                -> 0#
        , qqessPrevChar = c#
        }

countInputSpace :: AlexInput -> Int -> Int
countInputSpace AlexInput{aiPtr} len =
    utf8FoldlBounded len inc 0 aiPtr
    where
    inc !acc ' '#  = acc + 1
    inc !acc '\t'# = acc + 8
    inc !acc c#    = case fixChar c# of
        1## -> acc + 1
        _   -> acc

-- Translate unicode character into special symbol we teached Alex to recognize.
{-# INLINE fixChar #-}
fixChar :: Char# -> Word#
fixChar = \case
    -- These should not be translated since Alex knows about them
    '→'#    -> reservedSym
    '∷'#    -> reservedSym
    '⇒'#    -> reservedSym
    '∀'#    -> reservedSym
    '⦇'#    -> reservedSym
    '⦈'#    -> reservedSym
    '⟦'#    -> reservedSym
    '⟧'#    -> reservedSym
    '\x00'# -> fullStop
    '\x01'# -> fullStop
    '\x02'# -> fullStop
    '\x03'# -> fullStop
    '\x04'# -> fullStop
    '\x05'# -> fullStop
    '\x06'# -> fullStop
    '\x07'# -> fullStop
    '\x08'# -> other
    c# -> case ord# c# of
        c2# | isTrue# (c2# <=# 0x7f#) ->
              int2Word# c2# -- Plain ascii needs no fixing.
            | otherwise   ->
                case generalCategory (C# c#) of
                    UppercaseLetter      -> upper
                    LowercaseLetter      -> lower
                    TitlecaseLetter      -> upper
                    ModifierLetter       -> suffix
                    OtherLetter          -> lower
                    NonSpacingMark       -> suffix
                    DecimalNumber        -> digit
                    OtherNumber          -> digit
                    Space                -> space
                    ConnectorPunctuation -> symbol
                    DashPunctuation      -> symbol
                    OtherPunctuation     -> symbol
                    MathSymbol           -> symbol
                    CurrencySymbol       -> symbol
                    ModifierSymbol       -> symbol
                    OtherSymbol          -> symbol

                    SpacingCombiningMark -> space
                    EnclosingMark        -> other
                    LetterNumber         -> symbol
                    OpenPunctuation      -> symbol
                    ClosePunctuation     -> symbol
                    InitialQuote         -> symbol
                    FinalQuote           -> symbol
                    LineSeparator        -> space
                    ParagraphSeparator   -> space
                    Control              -> other
                    Format               -> other
                    Surrogate            -> other
                    PrivateUse           -> other
                    NotAssigned          -> other
    where
      fullStop, space, upper, lower, symbol :: Word#
      digit, suffix, reservedSym, other :: Word#
      fullStop    = 0x00## -- Don't care about these
      space       = 0x01##
      upper       = 0x02##
      lower       = 0x03##
      symbol      = 0x04##
      digit       = 0x05##
      suffix      = 0x06##
      reservedSym = 0x07##
      other       = 0x08##