{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
-- | This module is separate from the Lexer.x input to Alex
-- to segregate the automatically generated code from the
-- hand written code. The automatically generated code
-- causes lots of warnings which mask the interesting warnings.
module Config.LexerUtils where

import Data.Char            (GeneralCategory(..), generalCategory, digitToInt,
                             isAscii, isSpace, readLitChar, ord)
import Data.Monoid          ((<>))
import Data.Text            (Text)
import Data.Text.Lazy.Builder (Builder)
import Data.Word            (Word8)
import Numeric              (readInt)
import qualified Data.Text                  as Text
import qualified Data.Text.Lazy             as LText
import qualified Data.Text.Lazy.Builder     as Builder

#if !MIN_VERSION_base(4,8,0)
import Data.Monoid (mempty)
import Data.Functor ((<$))
#endif

import Config.Tokens

------------------------------------------------------------------------
-- Custom Alex wrapper
------------------------------------------------------------------------

type AlexInput = Located Text

alexStartPos :: Position
alexStartPos = Position { posLine = 1, posColumn = 1 }

alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (Located p cs)
  = do (c,!cs') <- Text.uncons cs
       let !p' = alexMove p c
           !b = byteForChar c
       return (b, Located p' cs')

alexMove :: Position -> Char -> Position
alexMove (Position line column) c =
  case c of
    '\t' -> Position line (((column + 7) `div` 8) * 8 + 1)
    '\n' -> Position (line + 1) 1
    _    -> Position line (column + 1)

------------------------------------------------------------------------
-- Lexer Modes
------------------------------------------------------------------------

data LexerMode
  = InNormal
  | InComment !CommentType !Position [Position]
  | InString !(Located Builder)

data CommentType = CommentState | StringState

--  token starting position -> token bytes -> lexer state -> (new state, token)
type Action = Located Text -> LexerMode -> (LexerMode, Maybe (Located Token))

-- Helper function for building an Action given a token constructor
-- function, a position, and the matched token.
tok ::
  (Text -> Token) {- ^ token constructor -} ->
  Action
tok f match st = (st, Just (fmap f match))

simpleAction :: (Located Text -> LexerMode -> LexerMode) -> Action
simpleAction f match st = (f match st, Nothing)

startComment :: Action
startComment = simpleAction $ \match st ->
  case st of
    InNormal                -> InComment CommentState (locPosition match) []
    InComment _ posn1 posns -> InComment CommentState (locPosition match) (posn1:posns)
    _                       -> error "startComment: Lexer failure"

startCommentString :: Action
startCommentString = simpleAction $ \match st ->
  case st of
    InComment _ posn1 posns -> InComment StringState (locPosition match) (posn1:posns)
    _                       -> error "startCommentString: Lexer failure"

endCommentString :: Action
endCommentString = simpleAction $ \_ st ->
  case st of
    InComment _ _ (posn:posns) -> InComment CommentState posn posns
    InComment _ _ []           -> InNormal
    _                          -> error "endComment: Lexer failure"

-- | Enter the string literal lexer
startString :: Action
startString = simpleAction $ \match _ -> InString (mempty <$ match)

-- | Emit completed string literal, exit string literal lexer and return to
-- Normal mode.
endString :: Action
endString _ st =
  case st of
    InString builder ->
      let !t = fmap (LText.toStrict . Builder.toLazyText) builder
      in (InNormal, Just (fmap String t))
    _ -> error "endString: Lexer failure"

-- | Add region of text to current string literal state. Escapes are handled
-- separately.
addString :: Action
addString = simpleAction $ \match st ->
  case st of
    InString builder ->
      InString (fmap (<> Builder.fromText (locThing match)) builder)
    _ -> error "addString: Lexer failure"

-- | Handle character escapes in string literal mode
addCharLit :: Action
addCharLit = simpleAction $ \match st ->
  case (st, readLitChar (Text.unpack (locThing match))) of
    (InString builder, [(c,"")]) ->
         InString (fmap (<> Builder.singleton c) builder)
    _ -> error "addCharLit: Lexer failure"

-- | Construct a 'Number' token from a token using a
-- given base. This function expect the token to be
-- legal for the given base. This is checked by Alex.
number ::
  Int  {- ^ prefix length      -} ->
  Int  {- ^ base               -} ->
  Text {- ^ sign-prefix-digits -} ->
  Token
number prefixLen base str =
  case readInt (fromIntegral base) (const True) digitToInt str2 of
    [(n,"")] -> Number base (s*n)
    _        -> Error
  where
  str2     = drop prefixLen str1
  (s,str1) = case Text.unpack str of
               '-':rest -> (-1, rest)
               rest     -> ( 1, rest)

-- | Process a section heading token
section :: Text -> Token
section = Section . Text.dropWhileEnd isSpace . Text.init

------------------------------------------------------------------------
-- Embed all of unicode, kind of, in a single byte!
------------------------------------------------------------------------

byteForChar :: Char -> Word8
byteForChar c
  | c <= '\6' = non_graphic
  | isAscii c = fromIntegral (ord c)
  | otherwise = case generalCategory c of
                  LowercaseLetter       -> lower
                  OtherLetter           -> lower
                  UppercaseLetter       -> upper
                  TitlecaseLetter       -> upper
                  DecimalNumber         -> digit
                  OtherNumber           -> digit
                  ConnectorPunctuation  -> symbol
                  DashPunctuation       -> symbol
                  OtherPunctuation      -> symbol
                  MathSymbol            -> symbol
                  CurrencySymbol        -> symbol
                  ModifierSymbol        -> symbol
                  OtherSymbol           -> symbol
                  Space                 -> space
                  ModifierLetter        -> other
                  NonSpacingMark        -> other
                  SpacingCombiningMark  -> other
                  EnclosingMark         -> other
                  LetterNumber          -> other
                  OpenPunctuation       -> other
                  ClosePunctuation      -> other
                  InitialQuote          -> other
                  FinalQuote            -> other
                  _                     -> non_graphic
  where
  non_graphic     = 0
  upper           = 1
  lower           = 2
  digit           = 3
  symbol          = 4
  space           = 5
  other           = 6