{-|
Module      : Hlex
Description : Lexer creation tools
Copyright   : (c) Sebastian Tee, 2023
License     : MIT

Tools needed to create a 'Lexer' from a lexical 'Grammar'.
-}
module Hlex
  ( -- * Example
    -- $example

    -- * Types
    Grammar
  , TokenSyntax(..)
  , Lexer
    -- ** Exceptions
  , LexException(..)
    -- * Functions
  , hlex
  ) where

import Text.Regex.TDFA ((=~))

-- | Exception thrown when a 'Lexer' is unable to lex a string.
data LexException = LexException
  Int    -- ^ Line number where the string that couldn't be lexed is located.
  Int    -- ^ Column where the string that couldn't be lexed is located.
  String -- ^ The string that couldn't be lexed.
  deriving (Read, Show, Eq)

-- | The individual rules that make up a 'Grammar'.
--
-- Each rule takes a __POSIX regular expression__ and either converts its
-- matches to tokens or skips them.
data TokenSyntax token
  = Skip -- ^ Skips over any matches.
      String -- ^ Regular expression.
  | Tokenize -- ^ Takes a function that converts the matched string to a token.
      String -- ^ Regular expression.
      (String -> token) -- ^ Function that converts the matched string into a token.
  | JustToken -- ^ Converts any regular expression match to a given token.
      String -- ^ Regular expression.
      token -- ^ Given token.

type InternalToken token = (String, Maybe (String -> token))

-- | Lexical grammar made up of 'TokenSyntax' rules.
--
-- The __order is important__: the 'Lexer' applies each 'TokenSyntax' rule in
-- the order listed, so earlier rules take precedence (see the @orderedLexer@
-- sketch at the end of this module).
type Grammar token = [TokenSyntax token]

-- | Converts a string into a list of tokens.
-- If the string does not follow the Lexer's 'Grammar', a 'LexException' is returned.
type Lexer token = String -> Either LexException [token]

tokenizerToInternalToken :: TokenSyntax a -> InternalToken a
tokenizerToInternalToken (Skip regex) = (regex, Nothing)
tokenizerToInternalToken (Tokenize regex toToken) = (regex, Just toToken)
tokenizerToInternalToken (JustToken regex token) = (regex, Just $ const token)

-- | Takes a given 'Grammar' and turns it into a 'Lexer'.
hlex :: Grammar token -> Lexer token
hlex = lexInternal 1 1 . map tokenizerToInternalToken

-- Lexes a program while tracking the current row and column for error
-- reporting. For each rule, the first match splits the program into the text
-- before the match, the match itself, and the text after it. The text before
-- the match is lexed with the remaining rules only (the current rule cannot
-- match there, since (=~) returns the first match), while the text after it
-- is lexed starting from the current rule again. Rules already exhausted at
-- this level match nowhere in the program, so they can safely be dropped.
lexInternal :: Int -> Int -> [InternalToken token] -> Lexer token
lexInternal _ _ _ "" = Right []
lexInternal row col ((regex, t):grammar) program =
  if null matchedText
    then lexInternal row col grammar program -- no match; try the next rule
    else do
      before <- parsedBefore
      after <- parsedAfter
      case t of
        Nothing -> Right $ before ++ after -- Skip rule: drop the matched text
        Just tk -> Right $ before ++ tk matchedText : after
  where
    (beforeProgram, matchedText, afterProgram) = program =~ regex :: (String, String, String)
    (afterRow, afterCol) = getLastCharPos row col (beforeProgram ++ matchedText)
    parsedBefore = lexInternal row col grammar beforeProgram
    parsedAfter = lexInternal afterRow afterCol ((regex, t):grammar) afterProgram
lexInternal row col _ invalidString = Left $ LexException row col invalidString

-- Given a starting position and the text consumed from it, returns the
-- position of the next character after that text.
getLastCharPos :: Int -> Int -> String -> (Int, Int)
getLastCharPos startRow startCol x = (startRow + addRow, addCol + if addRow == 0 then startCol else 1)
  where
    -- The "|" sentinel stops 'lines' from swallowing a trailing newline,
    -- which would otherwise leave the reported position on the previous line.
    ls = lines (x ++ "|")
    addRow = length ls - 1
    addCol = length (last ls) - 1
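
-- Rule-ordering sketch referenced from the 'Grammar' documentation above.
-- This is a hypothetical, non-exported example, not part of the library's
-- API: a narrow keyword rule must be listed before a broader identifier
-- rule, or the identifier rule consumes the keyword first. Swapping the
-- first two rules below would lex "if" as "IDENT:if" instead.
--
-- >>> orderedLexer "if x"
-- Right ["KEYWORD","IDENT:x"]
orderedLexer :: Lexer String
orderedLexer = hlex
  [ JustToken "if" "KEYWORD"           -- narrow rule listed first, so it wins
  , Tokenize "[a-zA-Z]+" ("IDENT:" ++) -- broader rule listed second
  , Skip "[ \\n\\r\\t]+"               -- skip whitespace
  ]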
{- $example
Here is an example module for a simple language.

@
module ExampleLang
  ( MyToken(..) -- Export the language's tokens and the lexer
  , myLexer
  ) where

import Hlex

data MyToken = Ident String -- String identifier token
             | Number Float -- Number token and its numeric value
             | Assign       -- Assignment operator token
             deriving(Show)

myGrammar :: Grammar MyToken
myGrammar = [ JustToken "=" Assign -- The "=" operator becomes the Assign token
            , Tokenize "[a-zA-Z]+" (\match -> Ident match) -- Identifier token carrying its string
            , Tokenize "[0-9]+(\\.[0-9]+)?" (\match -> Number (read match)) -- Number token with the parsed value stored as a Float
            , Skip "[ \\n\\r\\t]+" -- Skip whitespace
            ]

myLexer :: Lexer MyToken
myLexer = hlex myGrammar -- hlex turns a Grammar into a Lexer
@

Here is the lexer being used on a simple program.

>>> myLexer "x = 1.2"
Right [Ident "x", Assign, Number 1.2]

The lexer uses 'Either'. 'Right' means the lexer successfully lexed the program
into a list of @MyToken@s. If 'Left' were returned, it would carry a
'LexException'.
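
A sketch of the failure case (the output below assumes the example grammar
above): no rule in @myGrammar@ matches @"$"@, so the lexer reports the line
and column where it stopped.

>>> myLexer "x = $"
Left (LexException 1 5 "$")
-}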