{-|
Module      : Hlex
Description : Lexer creation tools
Copyright   : (c) Sebastian Tee, 2023
License     : MIT

Tools needed to create a 'Lexer' from a lexical 'Grammar'.
-}
module Hlex
     ( -- * Example
       -- $example

       -- * Types
       Grammar
     , TokenSyntax(..)
     , Lexer
       -- ** Exceptions
     , LexException(..)
       -- * Functions
     , hlex
     ) where

import Text.Regex.TDFA ((=~))

-- | Exception thrown when a 'Lexer' is unable to lex a string.
data LexException = LexException
  Int -- ^ The line number where the string that couldn't be lexed is located.
  Int -- ^ The column where the string that couldn't be lexed is located.
  String -- ^ The String that couldn't be lexed.
  deriving(ReadPrec [LexException]
ReadPrec LexException
Int -> ReadS LexException
ReadS [LexException]
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [LexException]
$creadListPrec :: ReadPrec [LexException]
readPrec :: ReadPrec LexException
$creadPrec :: ReadPrec LexException
readList :: ReadS [LexException]
$creadList :: ReadS [LexException]
readsPrec :: Int -> ReadS LexException
$creadsPrec :: Int -> ReadS LexException
Read, Int -> LexException -> ShowS
[LexException] -> ShowS
LexException -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [LexException] -> ShowS
$cshowList :: [LexException] -> ShowS
show :: LexException -> String
$cshow :: LexException -> String
showsPrec :: Int -> LexException -> ShowS
$cshowsPrec :: Int -> LexException -> ShowS
Show, LexException -> LexException -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: LexException -> LexException -> Bool
$c/= :: LexException -> LexException -> Bool
== :: LexException -> LexException -> Bool
$c== :: LexException -> LexException -> Bool
Eq)

-- | These are the individual rules that make up a 'Grammar'.
--
-- Takes a __POSIX regular expression__ then converts it to a token or skips it.
data TokenSyntax token
  = Skip -- ^ Skips over any matches.
    String -- ^ Regular expression.
  | Tokenize -- ^ Takes a function that converts the matched string to a token.
    String -- ^ Regular expression.
    (String -> token) -- ^ Function that converts the matched string into a token.
  | JustToken -- ^ Converts any regular expression matches to a given token.
    String -- ^ Regular expression.
    token -- ^ Given token.

type InternalToken token = (String, Maybe (String -> token))

-- | Lexical grammar made up of 'TokenSyntax' rules.
--
-- The __order is important__. The 'Lexer' will apply each 'TokenSyntax' rule in the order listed.
type Grammar token = [TokenSyntax token]

-- | Converts a string into a list of tokens.
-- If the string does not follow the Lexer's 'Grammar' a 'LexException' will be returned.
type Lexer token = String -> Either LexException [token]

tokenizerToInternalToken :: TokenSyntax a -> InternalToken a
tokenizerToInternalToken :: forall a. TokenSyntax a -> InternalToken a
tokenizerToInternalToken (Skip String
regex) = (String
regex, forall a. Maybe a
Nothing)
tokenizerToInternalToken (Tokenize String
regex String -> a
toToken) = (String
regex, forall a. a -> Maybe a
Just String -> a
toToken)
tokenizerToInternalToken (JustToken String
regex a
token) = (String
regex, forall a. a -> Maybe a
Just forall a b. (a -> b) -> a -> b
$ \String
_ -> a
token)

-- | Takes a given 'Grammar' and turns it into a 'Lexer'.
hlex :: Grammar token -> Lexer token
hlex :: forall token. Grammar token -> Lexer token
hlex = forall token. Int -> Int -> [InternalToken token] -> Lexer token
lexInternal Int
1 Int
1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map forall a. TokenSyntax a -> InternalToken a
tokenizerToInternalToken

lexInternal :: Int -> Int -> [InternalToken token] -> Lexer token
lexInternal :: forall token. Int -> Int -> [InternalToken token] -> Lexer token
lexInternal Int
_ Int
_ [InternalToken token]
_ String
"" = forall a b. b -> Either a b
Right []
lexInternal Int
row Int
col ((String
regex, Maybe (String -> token)
t):[InternalToken token]
grammar) String
program = if forall (t :: * -> *) a. Foldable t => t a -> Bool
null String
matchedText
  then forall token. Int -> Int -> [InternalToken token] -> Lexer token
lexInternal Int
row Int
col [InternalToken token]
grammar String
program
  else do
    [token]
before <- Either LexException [token]
parsedBefore
    [token]
after <- Either LexException [token]
parsedAfter
    case Maybe (String -> token)
t of
      Maybe (String -> token)
Nothing -> forall a b. b -> Either a b
Right forall a b. (a -> b) -> a -> b
$ [token]
before forall a. [a] -> [a] -> [a]
++ [token]
after
      Just String -> token
tk -> forall a b. b -> Either a b
Right forall a b. (a -> b) -> a -> b
$ [token]
before forall a. [a] -> [a] -> [a]
++ String -> token
tk String
matchedText forall a. a -> [a] -> [a]
: [token]
after
  where
    (String
beforeProgram, String
matchedText, String
afterProgram) = String
program forall source source1 target.
(RegexMaker Regex CompOption ExecOption source,
 RegexContext Regex source1 target) =>
source1 -> source -> target
=~ String
regex :: (String, String, String)
    (Int
afterRow, Int
afterCol) = Int -> Int -> String -> (Int, Int)
getLastCharPos Int
row Int
col (String
beforeProgram forall a. [a] -> [a] -> [a]
++ String
matchedText)
    parsedBefore :: Either LexException [token]
parsedBefore = forall token. Int -> Int -> [InternalToken token] -> Lexer token
lexInternal Int
row Int
col [InternalToken token]
grammar String
beforeProgram
    parsedAfter :: Either LexException [token]
parsedAfter = forall token. Int -> Int -> [InternalToken token] -> Lexer token
lexInternal Int
afterRow Int
afterCol ((String
regex, Maybe (String -> token)
t)forall a. a -> [a] -> [a]
:[InternalToken token]
grammar) String
afterProgram
lexInternal Int
row Int
col [InternalToken token]
_ String
invalidString = forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ Int -> Int -> String -> LexException
LexException Int
row Int
col String
invalidString

getLastCharPos :: Int -> Int -> String -> (Int, Int)
getLastCharPos :: Int -> Int -> String -> (Int, Int)
getLastCharPos Int
startRow Int
startCol String
x = (Int
startRow forall a. Num a => a -> a -> a
+ Int
addRow, Int
addCol forall a. Num a => a -> a -> a
+ if Int
addRow forall a. Eq a => a -> a -> Bool
== Int
0 then Int
startCol else Int
1)
  where
    ls :: [String]
ls = String -> [String]
lines String
x
    addRow :: Int
addRow = forall (t :: * -> *) a. Foldable t => t a -> Int
length [String]
ls forall a. Num a => a -> a -> a
- Int
1
    addCol :: Int
addCol = forall (t :: * -> *) a. Foldable t => t a -> Int
length forall a b. (a -> b) -> a -> b
$ forall a. [a] -> a
last [String]
ls

{- $example
Here is an example module for a simple language.

@
  module ExampleLang
       ( MyToken(..) -- Export the language's tokens and the lexer
       , myLexer
       ) where

  import Hlex

  data MyToken = Ident String -- String identifier token
               | Number Float -- Number token and numeric value
               | Assign       -- Assignment operator token
               deriving(Show)

  myGrammar :: Grammar MyToken
  myGrammar = [ JustToken "=" Assign                                       -- "=" Operator becomes the assign token
              , Tokenize "[a-zA-Z]+" (\match -> Ident match)                -- Identifier token with string
              , Tokenize "[0-9]+(\\.[0-9]+)?" (\match -> Number (read match) -- Number token with the parsed numeric value stored as a Float
              , Skip "[ \\n\\r\\t]+"                                          -- Skip whitespace
              ]

  myLexer :: Lexer MyToken
  myLexer = hlex myGrammar -- hlex turns a Grammar into a Lexer
@

Here is the lexer being used on a simple program.

>>> lexer "x = 1.2"
Right [Ident "x", Assign, Number 1.2]

The lexer uses 'Either'. Right means the lexer successfully parsed the program to a list of MyTokens.
If Left was returned it would be a 'LexException'.
-}