{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE NoMonomorphismRestriction #-}

-- | Lex GLua into MTokens
module GLua.Lexer where

import GLua.AG.Token (MToken (..), Region (..), Token (..))

import Data.Char (ord)
import Text.ParserCombinators.UU (
  ExtAlternative (opt, (<<|>)),
  P,
  micro,
  pEnd,
  pErrors,
  pMany,
  pPos,
  pReturn,
  pSome,
  parse,
  (<$$>),
  (<**>),
 )
import Text.ParserCombinators.UU.BasicInstances (
  Error,
  Insertion (Insertion),
  LineColPos (..),
  Str,
  createStr,
  pMunch,
  pSatisfy,
  pSym,
  pToken,
 )
import Text.ParserCombinators.UU.Utils (pDigit)

-- | String parser that maintains positions.
type LParser a = P (Str Char String LineColPos) a

-- | Whitespace parser that requires at least one whitespace character
parseWhitespace :: LParser String
parseWhitespace = pSome $ pSatisfy (`elem` " \r\n\t") (Insertion "Whitespace" ' ' 5)

-- | Whitespace parser that requires 0 or more whitespace characters
parseOptionalWhitespace :: LParser String
parseOptionalWhitespace = pMany $ pSatisfy (`elem` " \r\n\t") (Insertion "Whitespace" ' ' 5)

-- | Blanco parser. Parses anything. Used in parsing comments.
parseAnyChar :: LParser Char
parseAnyChar = pSatisfy (const True) (Insertion "Any character" 'y' 5)

-- See luajit's src/lj_char.c and src/lj_char.h
pIdentifierCharacter :: LParser Char
pIdentifierCharacter = pSatisfy validChar (Insertion "Identifying character (letter, number, emoji)" 'a' 5)
  where
    validChar :: Char -> Bool
    validChar c =
      between c '0' '9'
        || between c 'A' 'Z'
        || c == '_'
        || between c 'a' 'z'
        || ord c >= 128

pNonDigitIdentifierCharacter :: LParser Char
pNonDigitIdentifierCharacter = pSatisfy validChar (Insertion "Identifying character (letter, emoji)" 'a' 5)
  where
    validChar :: Char -> Bool
    validChar c =
      between c 'A' 'Z'
        || c == '_'
        || between c 'a' 'z'
        || ord c >= 128

between :: Char -> Char -> Char -> Bool
between c left right = c >= left && c <= right

-- | Parses a C-style block comment.
parseCBlockComment :: LParser String
parseCBlockComment =
  const ""
    <$> pToken "*/"
      <<|> (:)
    <$> parseAnyChar
    <*> parseCBlockComment

-- | Try to parse a block comment.
-- Might actually return a single line Dash comment, because for example
-- the following line is a line comment, rather than a block comment
-- [===== <- missing the last '[' bracket.
parseBlockComment :: LParser Token
parseBlockComment = pToken "[" *> nested 0
  where
    -- The amount of =-signs in the string delimiter is n
    nested :: Int -> LParser Token
    nested n =
      pToken "="
        *> nested (n + 1)
          <<|> DashBlockComment n
        <$ pToken "["
        <*> restString n
          <<|> lineComment n
        <$> pUntilEnd

    -- Turns out we were describing a line comment all along, cheeky bastard!
    -- (the last [ of the block comment start token is missing)
    lineComment :: Int -> String -> Token
    lineComment n str = DashComment $ '[' : replicate n '=' ++ str

    -- Right-recursive grammar. This part searches for the rest of the string until it finds the ]=^n] token
    restString :: Int -> LParser String
    restString n =
      const ""
        <$> pToken ("]" ++ replicate n '=' ++ "]")
          <<|> (:)
        <$> parseAnyChar
        <*> restString n

-- | Parse the string until the end. Used in parseLineComment among others.
pUntilEnd :: LParser String
pUntilEnd = pMunch (\c -> c /= '\n' && c /= '\r')

-- | A comment that spans until the end of the line.
parseLineComment :: String -> LParser String
parseLineComment prefix = flip const <$> pToken prefix <*> pUntilEnd

-- | Parses a multiline string except for its first character (e.g. =[ string ]=])
-- This is because the first [ could also just be parsed as a square bracket.
nestedString :: LParser String
nestedString = nested 0
  where
    -- The amount of =-signs in the string delimiter is n
    nested :: Int -> LParser String
    nested n =
      (\str -> "=" ++ str)
        <$ pToken "="
        <*> nested (n + 1)
          <<|> ('[' :)
        <$ pToken "["
        <*> restString n

    -- Right-recursive grammar. This part searches for the rest of the string until it finds the ]=^n] token
    restString :: Int -> LParser String
    restString n =
      pToken ("]" ++ replicate n '=' ++ "]")
        <<|> (:)
        <$> parseAnyChar
        <*> restString n

-- | Parse any kind of comment.
parseComment :: LParser Token
parseComment =
  pToken "--"
    <**> (const <$> (parseBlockComment <<|> DashComment <$> pUntilEnd)) -- Dash block comment and dash comment both start with "--"
    <<|> pToken "/"
    <**> ( const
            <$> ( SlashBlockComment
                    <$ pToken "*"
                    <*> parseCBlockComment
                      <<|> SlashComment
                    <$> parseLineComment "/"
                      <<|> pReturn Divide -- The /-sign is here because it also starts with '/'
                )
         )

-- | Parse single line strings e.g. "sdf", 'werf'.
parseLineString :: Char -> LParser String
parseLineString c = pSym c *> innerString
  where
    innerString :: LParser String
    innerString =
      pSym '\\'
        <**> ((\c' str esc -> esc : c' ++ str) <$> escapeSequence <*> innerString) -- Escaped character in string always starts with backslash
        <<|> const ""
        <$> pSym c
          <<|> (:)
        <$> pNoNewline
        <*> innerString -- the end of the string
        -- the next character in the string
    escapeSequence :: LParser String
    escapeSequence = (:) <$> pSym 'z' <*> parseOptionalWhitespace <<|> (: []) <$> parseAnyChar

    pNoNewline :: LParser Char
    pNoNewline = pSatisfy (/= '\n') (Insertion "Anything but a newline" c 5)

-- | Single and multiline strings.
parseString :: LParser Token
parseString =
  DQString
    <$> parseLineString '"'
      <<|> SQString
    <$> parseLineString '\''
      <<|>
      -- Parse either a multiline string or just a bracket.
      -- Placed here because they have the first token '[' in common
      pSym '['
      <**> ( (\_ -> MLString . (:) '[')
              <$$> nestedString
              <<|> const
              <$> pReturn LSquare
           )

-- | Parse any kind of number.
parseNumber :: LParser Token
parseNumber = TNumber <$> ((++) <$> (pZeroPrefixedNumber <<|> pNumber) <*> (pLLULL <<|> opt parseNumberSuffix ""))
  where
    -- Numbers starting with 0 can be regular numbers, hexadecimals or binary
    pZeroPrefixedNumber :: LParser String
    pZeroPrefixedNumber =
      pSym '0'
        <**> ( (\hex _0 -> _0 : hex)
                <$> pHexadecimal
                  <<|> (\bin _0 -> _0 : bin)
                <$> pBinary
                  <<|> (\digits _0 -> _0 : digits)
                <$> (pDecimal <<|> pNumber)
                  <<|> pure (: [])
             )

    pNumber :: LParser String
    pNumber = (++) <$> pSome pDigit <*> opt pDecimal ""

    pDecimal :: LParser String
    pDecimal = (:) <$> pSym '.' <*> pMany pDigit

    pHexDecimal :: LParser String
    pHexDecimal = (:) <$> pSym '.' <*> pMany pHex

    pHexadecimal :: LParser String
    pHexadecimal = (:) <$> (pSym 'x' <<|> pSym 'X') <*> ((++) <$> pSome pHex <*> opt pHexDecimal "")

    pBinary :: LParser String
    pBinary = (:) <$> (pSym 'b' <<|> pSym 'B') <*> ((++) <$> pSome pBin <*> opt pDecimal "")

    pHex :: LParser Char
    pHex =
      pDigit
        <<|> pSym 'a'
        <<|> pSym 'b'
        <<|> pSym 'c'
        <<|> pSym 'd'
        <<|> pSym 'e'
        <<|> pSym 'f'
        <<|> pSym 'A'
        <<|> pSym 'B'
        <<|> pSym 'C'
        <<|> pSym 'D'
        <<|> pSym 'E'
        <<|> pSym 'F'

    pBin :: LParser Char
    pBin = pSym '0' <<|> pSym '1'

    -- LL/ULL suffix of a number, making it signed/unsigned int64 respectively
    -- http://luajit.org/ext_ffi_api.html#literals
    pLLULL :: LParser String
    pLLULL = pULL <<|> pLL

    pLL :: LParser String
    pLL = "LL" <$ (pSym 'L' <<|> pSym 'l') <* (pSym 'L' <<|> pSym 'l')

    pULL :: LParser String
    pULL = "ULL" <$ (pSym 'U' <<|> pSym 'u') <* pLL

-- Parse the suffix of a number
parseNumberSuffix :: LParser String
parseNumberSuffix = imaginary <<|> extension
  where
    imaginary = (: []) <$> (pSym 'i' <<|> pSym 'I')

    extension =
      (\e s d -> e : s ++ d)
        <$> (pSym 'e' <<|> pSym 'E' <<|> pSym 'p' <<|> pSym 'P')
        <*> opt (pToken "+" <<|> pToken "-") ""
        <*> pSome pDigit

-- | Parse a keyword. Note: It must really a key/word/! This parser makes sure to return an identifier when
-- it's actually an identifier that starts with that keyword.
parseKeyword :: Token -> String -> LParser Token
parseKeyword tok word =
  pToken word
    <**> ((\k -> Identifier . (++) k) <$$> pSome pIdentifierCharacter <<|> const <$> pReturn tok)

-- | Parse just an identifier.
parseIdentifier :: LParser String
parseIdentifier = (:) <$> pNonDigitIdentifierCharacter <*> pMany pIdentifierCharacter

-- | Parse a label.
parseLabel :: LParser Token
parseLabel =
  Label
    <$ pToken "::"
    <*> parseOptionalWhitespace
    <*> parseIdentifier
    <*> parseOptionalWhitespace
    <* pToken "::"

-- | Parse anything to do with dots. Indexaction (.), concatenation (..) or varargs (...)
parseDots :: LParser Token
parseDots =
  pToken "."
    <**> ( -- A dot means it's either a VarArg (...), concatenation (..) or just a dot (.)
           const
            <$> ( pToken "."
                    <**> ( const VarArg
                            <$ pToken "."
                              <<|> const
                            <$> pReturn Concatenate
                         )
                )
              <<|> (\ds sfx dot -> TNumber $ dot ++ ds ++ sfx)
            <$> pSome pDigit
            <*> opt parseNumberSuffix ""
              <<|> const
            <$> pReturn Dot
         )

-- | Parse any kind of token.
parseToken :: LParser Token
parseToken =
  parseComment
    <<|>
    -- Constants
    parseString
    <<|> parseNumber
    <<|> parseKeyword TTrue "true"
    <<|> parseKeyword TFalse "false"
    <<|> parseKeyword Nil "nil"
    <<|> parseKeyword Not "not"
    <<|> parseKeyword And "and"
    <<|> parseKeyword Or "or"
    <<|> parseKeyword Function "function"
    <<|> parseKeyword Local "local"
    <<|> parseKeyword If "if"
    <<|> parseKeyword Then "then"
    <<|> parseKeyword Elseif "elseif"
    <<|> parseKeyword Else "else"
    <<|> parseKeyword For "for"
    <<|> parseKeyword In "in"
    <<|> parseKeyword Do "do"
    <<|> parseKeyword While "while"
    <<|> parseKeyword Until "until"
    <<|> parseKeyword Repeat "repeat"
    <<|> parseKeyword Continue "continue"
    <<|> parseKeyword Break "break"
    <<|> parseKeyword Return "return"
    <<|> parseKeyword End "end"
    <<|> Identifier
    <$> parseIdentifier
      <<|> Semicolon
    <$ pToken ";"
      <<|> parseDots
      <<|>
      -- Operators
      Plus
    <$ pToken "+"
      <<|> Minus
    <$ pToken "-"
      <<|> Multiply
    <$ pToken "*"
      <<|> Modulus
    <$ pToken "%"
      <<|> Power
    <$ pToken "^"
      <<|> TEq
    <$ pToken "=="
      <<|> Equals
    <$ pToken "="
      <<|> TNEq
    <$ pToken "~="
      <<|> TCNEq
    <$ pToken "!="
      <<|> CNot
    <$ pToken "!"
      <<|> TLEQ
    <$ pToken "<="
      <<|> TLT
    <$ pToken "<"
      <<|> TGEQ
    <$ pToken ">="
      <<|> TGT
    <$ pToken ">"
      <<|> parseLabel
      <<|> Colon
    <$ pToken ":"
      <<|> Comma
    <$ pToken ","
      <<|> Hash
    <$ pToken "#"
      `micro` 10
      <<|> CAnd -- Add micro cost to prevent conflict with parseHashBang
    <$ pToken "&&"
      <<|> COr
    <$ pToken "||"
      <<|> LRound
    <$ pToken "("
      <<|> RRound
    <$ pToken ")"
      <<|> LCurly
    <$ pToken "{"
      <<|> RCurly
    <$ pToken "}"
      <<|>
      -- Other square bracket is parsed in parseString
      RSquare
    <$ pToken "]"
      <<|> Whitespace
    <$> parseWhitespace

-- | A thing of which the region is to be parsed
annotated :: (Region -> a -> b) -> LParser a -> LParser b
annotated f p = (\s t e -> f (Region s e) t) <$> pPos <*> p <*> pPos

-- | parse located MToken
parseMToken :: LParser MToken
parseMToken = annotated MToken parseToken

-- | Parse a list of tokens and turn them into MTokens.
parseTokens :: LParser [MToken]
parseTokens = pMany parseMToken

-- | Parse the potential #!comment on the first line
-- Lua ignores the first line if it starts with #
parseHashBang :: LParser String
parseHashBang = opt (pToken "#" <* pUntilEnd) ""

-- | Lex a string with a given lexer
lexFromString :: LParser a -> String -> (a, [Error LineColPos])
lexFromString p = parse ((,) <$> p <*> pErrors <* pEnd) . createStr (LineColPos 0 0 0)

-- | Parse a string into MTokens. Also returns parse errors.
execParseTokens :: String -> ([MToken], [Error LineColPos])
execParseTokens = parse ((,) <$ parseHashBang <*> parseTokens <*> pErrors <* pEnd) . createStr (LineColPos 0 0 0)