{-# LANGUAGE OverloadedStrings #-}

module BNFC.Backend.Haskell.Lexer where

import BNFC.Prelude

import Control.Monad.State

import           Data.List   (intersperse)
import qualified Data.Map    as Map
import           Data.String (fromString)

import Prettyprinter
import System.FilePath (takeBaseName)

import BNFC.Backend.Common.StringUtils (escapeChars)
import BNFC.Backend.Common.Utils as Utils

import BNFC.Backend.CommonInterface.Backend

import BNFC.Backend.Haskell.Layout
import BNFC.Backend.Haskell.Options
import BNFC.Backend.Haskell.State

import BNFC.Backend.Haskell.Utilities.Lexer
import BNFC.Backend.Haskell.Utilities.Utils

import BNFC.CF
import BNFC.Lexing
import BNFC.Options.GlobalOptions

import BNFC.Types.Position
import BNFC.Types.Regex

import qualified BNFC.Utils.List2 as List2


haskellLexer :: LBNF -> State HaskellBackendState Result
haskellLexer lbnf = do
  st <- get
  layout <- haskellLayout lbnf
  let cfName      = takeBaseName $ optInput $ globalOpt st
      inDirectory = inDir $ haskellOpts st
      nSpace      = nameSpace $ haskellOpts st
      tt          = tokenText $ haskellOpts st
      toks        = lexerParserTokens st
      lexerSpecification = cf2lexer lbnf cfName inDirectory nSpace tt toks
  return $
    if layoutsAreUsed lbnf
    then (mkFilePath inDirectory nSpace cfName "Lex" "x", lexerSpecification) : layout
    else [(mkFilePath inDirectory nSpace cfName "Lex" "x", lexerSpecification)]

cf2lexer :: LBNF -> String -> Bool -> Maybe String -> TokenText -> [Token] -> String
cf2lexer lbnf name inDir nameSpace tokenText toks =
  docToString defaultLayoutOptions $ cf2doc lbnf name inDir nameSpace tokenText toks

cf2doc :: LBNF -> String -> Bool -> Maybe String -> TokenText -> [Token] -> Doc ()
cf2doc lbnf cfName inDir nameSpace tokenText toks = (vsep . intersperse emptyDoc)
  [ prelude cfName inDir nameSpace tokenText
  , cMacros
  , rMacros lbnf
  , restOfAlex tokenText toks lbnf
  ]

prelude :: String -> Bool -> Maybe String -> TokenText -> Doc ()
prelude cfName inDir nameSpace tokenText = vsep $
  [ "-- File generated by the BNF Converter."
  , emptyDoc
  , "-- -*- haskell -*-"
  , "-- Lexer definition for use with Alex 3."
  , lbrace
  , "{-# OPTIONS -fno-warn-incomplete-patterns #-}"
  , "{-# OPTIONS_GHC -w #-}"
  , emptyDoc
  , "{-# LANGUAGE PatternSynonyms #-}"
  , emptyDoc
  , "module" <+> fromString (mkModule inDir nameSpace cfName "Lex")  <+> "where"
  , emptyDoc
  , "import Prelude"
  , emptyDoc
  ]
  ++
  Utils.when (tokenText /= StringToken) [ tokenTextImport tokenText, emptyDoc ]
  ++
  [ "import qualified Data.Bits"
  , "import           Data.Char     (ord)"
  , "import           Data.Function (on)"
  , "import           Data.Maybe    (fromMaybe)"
  , "import qualified Data.Map      as Map"
  , "import           Data.Map      (Map)"
  , "import           Data.Word     (Word8)"
  , rbrace
  ]

-- | Character class definitions.

cMacros :: Doc ()
cMacros = vsep
  [ "-- Predefined character classes"
  , emptyDoc
  , "$c = [A-Z\\192-\\221] # [\\215]  -- capital isolatin1 letter (215 = \\times)"
  , "$s = [a-z\\222-\\255] # [\\247]  -- small   isolatin1 letter (247 = \\div  )"
  , "$l = [$c $s]         -- letter"
  , "$d = [0-9]           -- digit"
  , "$i = [$l $d _ ']     -- identifier character"
  , "$u = [. \\n]          -- universal: any character"
  ]

-- | Regular expressions and lex actions.

rMacros :: LBNF -> Doc ()
rMacros lbnf = vsep $ Utils.unless (Map.null (_lbnfSymbols lbnf))
  [ "-- Symbols and non-identifier-like reserved words"
  , emptyDoc
  , "@rsyms =" <+> (hsep . intersperse pipe) (fromString . unwords . esc <$> symbs)
  ]
  where
    symbs :: [String]
    symbs = unicodeAndSymbols lbnf
    esc :: String -> [String]
    esc s = if null a then rest else show a : rest
      where
      (a, r) = span (\ c -> isAscii c && isAlphaNum c) s
      rest = case r of
        []     -> []
        c : xs -> (if isPrint c then ['\\',c] else '\\' : show (ord c)) : esc xs

-- rest af Alex.

restOfAlex :: TokenText -> [Token] -> LBNF -> Doc ()
restOfAlex tokenText toks lbnf = vsep
    [ ":-"
    ,  vsep $ concat
      [ [ emptyDoc ]
      -- Line comments.
      , Utils.when (not (null (_lbnfLineComments lbnf)))
        [ vsep $ lineComment . snd
          <$>
          Map.toList (_lbnfLineComments lbnf)
        , emptyDoc
        ]
      -- Block comments.
      , Utils.when (not (null (_lbnfBlockComments lbnf)))
        [ vsep $ blockComment . snd
          <$>
          Map.toList (_lbnfBlockComments lbnf)
        , emptyDoc
        ]
      ]
    , "-- Whitespace (skipped)"
    , "$white+ ;"
    , vsep $ concat
      [ [ emptyDoc ]
      , Utils.unless (null (unicodeAndSymbols lbnf))
        [ "-- Symbols"
        , "@rsyms"
        , indent 4 "{ tok (eitherResIdent TV) }"
        , emptyDoc
        ]
      , Utils.when (not (null lbnfTokens))
        [ userDefTokenTypes
        , emptyDoc
        ]
      , [ "-- Keywords and Ident"
        , "$l $i*"
        , indent 4 "{ tok (eitherResIdent TV) }"
        , emptyDoc
        ]
      , Utils.when (BString `elem` usedBuiltins)
        [ "-- String"
        , "\\\" ([$u # [\\\" \\\\ \\n]] | (\\\\ (\\\" | \\\\ | \\' | n | t | r | f)))* \\\""
        , indent 4 "{ tok (TL . unescapeInitTail) }"
        , emptyDoc
        ]
      , Utils.when (BChar `elem` usedBuiltins)
        [ "-- Char"
        , "\\\' ($u # [\\\' \\\\] | \\\\ [\\\\ \\\' n t r f]) \\'"
        , indent 4 "{ tok TC }"
        , emptyDoc
        ]
      , Utils.when (BInteger `elem` usedBuiltins)
        [ "-- Integer"
        , "$d+"
        , indent 4 "{ tok TI }"
        , emptyDoc
        ]
      , Utils.when (BDouble  `elem` usedBuiltins)
        [ "-- Double"
        , "$d+ \\. $d+ (e (\\-)? $d+)?"
        , indent 4 "{ tok TD }"
        , emptyDoc
        ]
      ]
    , lbrace
    , "-- | Create a token with position."
    , "tok :: (" <> stringType <> " -> Tok) -> (Posn -> " <> stringType <> " -> Token)"
    , "tok f p = PT p . f"
    , emptyDoc
    , "-- | Token without position."
    , tokDataTypes tokenText toks
    , emptyDoc
    , "-- | Smart constructor for 'Tok' for the sake of backwards compatibility."
    , "pattern TS :: " <> stringType <> " -> Int -> Tok"
    , "pattern TS t i = TK (TokSymbol t i)"
    , emptyDoc
    , "-- | Keyword or symbol tokens have a unique ID."
    , "data TokSymbol = TokSymbol"
    , indent 2 $ vsep
      [ "{ tsText :: " <> stringType
      , indent 4 "-- ^ Keyword or symbol text."
      , ", tsID   :: !Int"
      , indent 4 "-- ^ Unique ID."
      , "} deriving (Show)" ]
    , emptyDoc
    , "-- | Keyword/symbol equality is determined by the unique ID."
    , "instance Eq  TokSymbol where (==)    = (==)    `on` tsID"
    , emptyDoc
    , "-- | Keyword/symbol ordering is determined by the unique ID."
    , "instance Ord TokSymbol where compare = compare `on` tsID"
    , emptyDoc
    , "-- | Token with position."
    , "data Token"
    , "  = PT  Posn Tok"
    , "  | Err Posn"
    , "  deriving (Eq, Show, Ord)"
    , emptyDoc
    , "-- | Pretty print a position."
    , "printPosn :: Posn -> String"
    , "printPosn (Pn _ l c) = \"line \" ++ show l ++ \", column \" ++ show c"
    , emptyDoc
    , "-- | Pretty print the position of the first token in the list."
    , "tokenPos :: [Token] -> String"
    , "tokenPos (t:_) = printPosn (tokenPosn t)"
    , "tokenPos []    = \"end of file\""
    , emptyDoc
    , "-- | Get the position of a token."
    , "tokenPosn :: Token -> Posn"
    , "tokenPosn (PT p _) = p"
    , "tokenPosn (Err p)  = p"
    , emptyDoc
    , "-- | Get line and column of a token."
    , "tokenLineCol :: Token -> (Int, Int)"
    , "tokenLineCol = posLineCol . tokenPosn"
    , emptyDoc
    , "-- | Get line and column of a position."
    , "posLineCol :: Posn -> (Int, Int)"
    , "posLineCol (Pn _ l c) = (l,c)"
    , emptyDoc
    , "-- | Convert a token into \"position token\" form."
    , "mkPosToken :: Token -> ((Int, Int), " <> stringType <> ")"
    , "mkPosToken t = (tokenLineCol t, tokenText t)"
    , emptyDoc
    , tokenTextfunction
    , emptyDoc
    , "-- | Convert a token to a string."
    , "prToken :: Token -> String"
    , "prToken t =" <+> applyP stringUnpack "tokenText t"
    , emptyDoc
    , "-- | Convert potential keyword into token or use fallback conversion."
    , "eitherResIdent :: (" <> stringType <+> "-> Tok) ->" <+> stringType <+> "-> Tok"
    , "eitherResIdent tv s = fromMaybe (tv s) (Map.lookup s resWords)"
    , emptyDoc
    , "-- | The keywords and symbols of the language organized as a Map."
    , if isStringToken tokenText
      then "resWords :: Map String Tok"
      else "resWords :: Map Data.Text.Text Tok"
    , "resWords = Map.fromAscList"
    , indent 2 $ vsep $ zipWith
      (<+>) ("[" : repeat ",") tokenTuples
    , indent 2 rbracket
    , emptyDoc
    , "-- | Unquote string literal."
    , "unescapeInitTail ::" <+> stringType <+> "->" <+> stringType
    , "unescapeInitTail =" <+> fromString stringPack <+>
      ". unesc . tail . " <> fromString stringUnpack
    , "  where"
    , "  unesc s = case s of"
    , "    '\\\\':c:cs | elem c ['\\\"', '\\\\', '\\\''] -> c : unesc cs"
    , "    '\\\\':'n':cs  -> '\\n' : unesc cs"
    , "    '\\\\':'t':cs  -> '\\t' : unesc cs"
    , "    '\\\\':'r':cs  -> '\\r' : unesc cs"
    , "    '\\\\':'f':cs  -> '\\f' : unesc cs"
    , "    '\"':[]       -> []"
    , "    c:cs         -> c : unesc cs"
    , "    _            -> []"
    , emptyDoc
    , "-------------------------------------------------------------------"
    , "-- Alex wrapper code."
    , "-- A modified \"posn\" wrapper."
    , "-------------------------------------------------------------------"
    , emptyDoc
    , "data Posn = Pn !Int !Int !Int"
    , "  deriving (Eq, Show, Ord)"
    , emptyDoc
    , "alexStartPos :: Posn"
    , "alexStartPos = Pn 0 1 1"
    , emptyDoc
    , "alexMove :: Posn -> Char -> Posn"
    , "alexMove (Pn a l c) '\\t' = Pn (a+1)  l     (((c+7) `div` 8)*8+1)"
    , "alexMove (Pn a l c) '\\n' = Pn (a+1) (l+1)   1"
    , "alexMove (Pn a l c) _    = Pn (a+1)  l     (c+1)"
    , emptyDoc
    , "type Byte = Word8"
    , emptyDoc
    , "type AlexInput ="
    , indent 2 $ vsep
      [ "( Posn     -- current position"
      , ", Char     -- previous char"
      , ", [Byte]   -- pending bytes on the current char"
      ,  comma <+> stringType <+> ") -- current input string"
      ]
    , emptyDoc
    , "tokens ::" <+> stringType <+> "-> [Token]"
    , "tokens str = go (alexStartPos, '\\n', [], str)"
    , indent 4 $ vsep
      [ "where"
      , indent 2 $ vsep
        [ "go :: AlexInput -> [Token]"
        , "go inp@(pos, _, _, str) ="
        , indent 2 $ vsep
          [ "case alexScan inp 0 of"
          , indent 2 $ vsep
            [ "AlexEOF                   -> []"
            , "AlexError (pos, _, _, _)  -> [Err pos]"
            , "AlexSkip  inp' len        -> go inp'"
            , "AlexToken inp' len act    -> act pos (" <+> stringTake <+> "len str) : (go inp')"
            ]
          ]
        ]
      ]
    , emptyDoc
    , "alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)"
    , "alexGetByte (p, c, (b:bs), s) = Just (b, (p, c, bs, s))"
    , "alexGetByte (p, _, [], s) ="
    , indent 2 $ vsep
      [ "case" <+> apply stringUncons "s" <+> " of"
      , indent 2 $ vsep
        [ stringNilP <+> "-> Nothing"
        , stringConsP <+> "->"
        , indent 2 $ vsep
          [ "let p'     = alexMove p c"
          , indent 4 "(b:bs) = utf8Encode c"
          , "in p' `seq` Just (b, (p', c, bs, s))"
          ]
        ]
      ]
    , emptyDoc
    , "alexInputPrevChar :: AlexInput -> Char"
    , "alexInputPrevChar (p, c, bs, s) = c"
    , emptyDoc
    , "-- | Encode a Haskell String to a list of Word8 values, in UTF8 format."
    , "utf8Encode :: Char -> [Word8]"
    , "utf8Encode = map fromIntegral . go . ord"
    , indent 2 "where"
    , indent 2 "go oc"
    , indent 3 "| oc <= 0x7f       = [oc]"
    , emptyDoc
    , indent 3 "| oc <= 0x7ff      = [ 0xc0 + (oc `Data.Bits.shiftR` 6)"
    , indent 24 ", 0x80 + oc Data.Bits..&. 0x3f"
    , indent 24 "]"
    , emptyDoc
    , indent 3 "| oc <= 0xffff     = [ 0xe0 + (oc `Data.Bits.shiftR` 12)"
    , indent 24 ", 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)"
    , indent 24 ", 0x80 + oc Data.Bits..&. 0x3f"
    , indent 24 "]"
    , indent 3 "| otherwise        = [ 0xf0 + (oc `Data.Bits.shiftR` 18)"
    , indent 24 ", 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f)"
    , indent 24 ", 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)"
    , indent 24 ", 0x80 + oc Data.Bits..&. 0x3f"
    , indent 24 "]"
    , rbrace
    ]

  where

    symbolsKeywords :: [(Doc (), Int)]
    symbolsKeywords =
      fmap
      (\(s,i) -> (fromString (escapeChars $ toList s), i))
      (Map.toList $ _lbnfSymbolsKeywords lbnf)

    tokenTuples :: [Doc ()]
    tokenTuples = map toTokTuple symbolsKeywords

    -- tuple containing resersed word and corresponding token.
    toTokTuple :: (Doc (), Int) -> Doc ()
    toTokTuple (n, i) =
      if isStringToken tokenText
      then tupled [dquotes n, "TS" <+> dquotes n <+> fromString (show i)]
      else tupled
        [ "Data.Text.pack" <+> dquotes n
        , "TS" <+> parens ( "Data.Text.pack" <+> dquotes n) <+> fromString (show i)
        ]

    lineComment :: LineComment -> Doc ()
    lineComment (LineComment s) = vsep
      [ "-- Line comment" <+> s'
      , s' <+> "[.]*" <+> semi
      ]
      where
        s' :: Doc ()
        s' = dquotes (fromString (toList s))

    blockComment :: BlockComment -> Doc ()
    blockComment (BlockComment s1 s2) =
      vsep
      [ "-- Block comment" <+>
        dquotes (fromString (toList s1)) <+>
        dquotes (fromString (toList s2))
      , printRegAlex (mkRegMultilineComment (toList s1) (toList s2)) <> semi
      ]

    lbnfTokens :: TokenDefs
    lbnfTokens =
      if hasIdentifier $ _lbnfTokenDefs lbnf
      then Map.delete ('I' :| "dent") (_lbnfTokenDefs lbnf)
      else _lbnfTokenDefs lbnf

    userDefTokenTypes :: Doc ()
    userDefTokenTypes = (vsep . intersperse emptyDoc) $
      userDefTokenType . (\(a,b) -> (a, (regexToken . wpThing) b))
      <$>
      Map.toList lbnfTokens

    userDefTokenType :: (CatName, Regex) -> Doc ()
    userDefTokenType (name, regex) = vsep
      [ "-- token" <+> fromString (toList name)
      , printRegAlex regex
      , indent 4 $ braces $ space <> "tok" <+> lparen <> "eitherResIdent T_" <>
        fromString (toList name) <> rparen <> space
      ]

    usedBuiltins:: [BuiltinCat]
    usedBuiltins = Map.keys $ _lbnfParserBuiltins lbnf

    tokDataTypes :: TokenText -> [Token] -> Doc ()
    tokDataTypes tt tokens =
      if isStringToken tt
      then
        vsep $
          [ "data Tok"
          , indent 2 "= TK {-# UNPACK #-} !TokSymbol  -- ^ Reserved word or symbol."
          , indent 2 "| TL !String                    -- ^ String literal."
          , indent 2 "| TI !String                    -- ^ Integer literal."
          , indent 2 "| TV !String                    -- ^ Identifier."
          , indent 2 "| TD !String                    -- ^ Float literal."
          , indent 2 "| TC !String                    -- ^ Character literal."
              ]
          ++ (indent 2 . tokDataType <$> filter isUserDefined tokens)
          ++ [space <> space <> "deriving (Eq, Show, Ord)"]
      else
        vsep $
          [ "data Tok"
          , indent 2 "= TK {-# UNPACK #-} !TokSymbol  -- ^ Reserved word or symbol."
          , indent 2 "| TL !Data.Text.Text            -- ^ String literal."
          , indent 2 "| TI !Data.Text.Text            -- ^ Integer literal."
          , indent 2 "| TV !Data.Text.Text            -- ^ Identifier."
          , indent 2 "| TD !Data.Text.Text            -- ^ Float literal."
          , indent 2 "| TC !Data.Text.Text            -- ^ Character literal."
              ]
          ++ (indent 2 . tokDataType <$> filter isUserDefined tokens)
          ++ [space <> space <> "deriving (Eq, Show, Ord)"]

    tokDataType :: Token -> Doc ()
    tokDataType token = vsep
      [ "|" <+> tokenName token <+> "!" <> stringType <> tokenComment token
      ]

    tokenTextfunction :: Doc ()
    tokenTextfunction = vsep $
      [ "-- | Convert a token to its text."
      , "tokenText :: Token -> " <> stringType
      , "tokenText t = case t of"
      , indent 2 $ vsep
        [ "PT _ (TS s _) -> s"
        , "PT _ (TL s)   -> " <> applyP stringPack "show s"
        , "PT _ (TI s)   -> s"
        , "PT _ (TV s)   -> s"
        , "PT _ (TD s)   -> s"
        , "PT _ (TC s)   -> s"
        , "Err _         -> " <> apply stringPack "\"#error\""
        ]
      ]
      ++ [ indent 2 "PT _ (" <> tokenName token <+> "s) -> s"
          | token <- filter isUserDefined toks ]

    (stringType, stringTake, stringUncons, stringPack, stringUnpack, stringNilP, stringConsP) =
      case tokenText of
        StringToken     -> ("String",        "take",    "",          "id",      "id",        "[]",      "(c:s)"     )
        TextToken       -> ("Data.Text.Text", "Data.Text.take", "Data.Text.uncons", "Data.Text.pack", "Data.Text.unpack", "Nothing", "Just (c,s)")


    apply :: String -> String -> Doc ()
    apply ""   s = fromString s
    apply "id" s = fromString s
    apply f    s = fromString f <+> fromString s

    applyP :: String -> String -> Doc ()
    applyP ""   s = fromString s
    applyP "id" s = fromString s
    applyP f    s = fromString f <+> lparen <> fromString s <> rparen

-----------------------------------------------------------

-- Pretty printer for Regex.

-- The top-level printing method.

printRegAlex :: Regex -> Doc ()
printRegAlex = prt 0

-- the printer class does the job
class Print a where
  prt :: Int -> a -> Doc ()

instance {-# OVERLAPPABLE #-} Print a => Print [a] where
  prt i as = hsep $ map (prt i) as

instance Print Char where
  prt _ = \case
    '\n'             -> "\\n"
    '\t'             -> "\\t"
    '\r'             -> "\\r"
    '\f'             -> "\\f"
    c | isAlphaNum c -> fromString [c]
    c | isPrint c    -> fromString $ '\\':[c]
    c                -> fromString $ '\\':show (ord c)

instance Print Regex where
  prt i e = case e of

    RChar (CMinus yes no) ->
      if isEmpty no
      then
        if onlyOneChar yes
        then prt 2 yes
        else
          lbracket <+> prt 2 yes <+> rbracket
      else
        lbracket <+>
        prt 2 yes <+>
        fromString "#" <+>
        prt 2 no <+>
        rbracket

    RAlts  regs -> prPrec i 1 $ hsep $ intersperse "|" $
      map (prt 1) $ List2.toList regs

    RMinus reg1 reg2 ->
      lbracket <+>
      prt 2 reg1 <+>
      fromString "#" <+>
      prt 2 reg2 <+>
      rbracket

    REps -> lparen <> rparen

    RSeqs  regs -> prPrec i 2 $ prt 2 $ List2.toList regs

    RStar  reg  -> prPrec i 3 (prt 3 reg) <> fromString "*"

    RPlus  reg  -> prPrec i 3 (prt 3 reg) <> fromString "+"

    ROpt   reg  -> prPrec i 3 (prt 3 reg) <> fromString "?"

instance Print CharClassUnion where
  prt i e = case e of
    CAny      -> "$u"
    CAlt alts -> hsep $ map (prt i) alts

instance Print CharClassAtom where
  prt _ e = case e of
    CChar c -> prt 0 c
    CDigit  -> fromString "$d"
    CLower  -> fromString "$s"
    CUpper  -> fromString "$c"