--  Compiler Toolkit: Self-optimizing lexers
--
--  Author : Manuel M. T. Chakravarty
--  Created: 2 March 99
--
--  Version $Revision: 1.2 $ from $Date: 2004/11/13 17:26:51 $
--
--  Copyright (c) 1999 Manuel M. T. Chakravarty
--
--  This library is free software; you can redistribute it and/or
--  modify it under the terms of the GNU Library General Public
--  License as published by the Free Software Foundation; either
--  version 2 of the License, or (at your option) any later version.
--
--  This library is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
--  Library General Public License for more details.
--
--- DESCRIPTION ---------------------------------------------------------------
--
--  Self-optimizing lexer combinators.
--
--  For detailed information, see ``Lazy Lexing is Fast'', Manuel
--  M. T. Chakravarty, in A. Middeldorp and T. Sato, editors, Proceedings of
--  Fourth Fuji International Symposium on Functional and Logic Programming,
--  Springer-Verlag, LNCS 1722, 1999.  (See my Web page for details.)
--
--  Thanks to Simon L. Peyton Jones <simonpj@microsoft.com> and Roman
--  Lechtchinsky <wolfro@cs.tu-berlin.de> for their helpful suggestions that
--  improved the design of this library.
--
--- DOCU ----------------------------------------------------------------------
--
--  language: Haskell 98
--
--  The idea is to combine the benefits of off-line generators with
--  combinators like in `Parsers.hs' (which builds on Swierstra/Duponcheel's
--  technique for self-optimizing parser combinators).  In essence, a state
--  transition graph representing a lexer table is computed on the fly, to
--  make lexing deterministic and based on cheap table lookups.
--
--  Regular expression map to Haskell expressions as follows.  If `x' and `y'
--  are regular expressions,
--
--        -> epsilon
--    xy  -> x +> y
--    x*y -> x `star` y
--    x+y -> x `plus` y
--    x?y -> x `quest` y
--
--  Given such a Haskelized regular expression `hre', we can use
--
--    (1) hre `lexaction` \lexeme -> Nothing 
--    (2) hre `lexaction` \lexeme -> Just token
--    (3) hre `lexmeta`   \lexeme pos s -> (res, pos', s', Nothing)
--    (4) hre `lexmeta`   \lexeme pos s -> (res, pos', s', Just l)
--
--  where `epsilon' is required at the end of `hre' if it otherwise ends on
--  `star', `plus', or `quest', and then, we have
--
--    (1) discards `lexeme' accepted by `hre',
--    (2) turns the `lexeme' accepted by `hre' into a token,
--    (3) while discarding the lexeme accepted by `hre', transforms the
--        position and/or user state, and
--    (4) while discarding the lexeme accepted by `hre', transforms the
--        position and/or user state and returns a lexer to be used for the
--        next lexeme.
--
--  The component `res' in case of a meta action, can be `Nothing', `Just
--  (Left err)', or `Just (Right token)' to return nothing, an error, or a
--  token from a meta action, respectively.
--
--  * By adding `ctrlLexer', `Positions' are properly handled in the presence
--    of layout control characters.
--
--  * This module makes essential use of graphical data structures (for
--    representing the state transition graph) and laziness (for maintaining
--    the last action in `execLexer'.
--
--  NOTES:
--
--  * In this implementation, the combinators `quest`, `star`, and `plus` are
--    *right* associative - this was different in the ``Lazy Lexing is Fast''
--    paper.  This change was made on a suggestion by Martin Norb�ck
--    <d95mback@dtek.chalmers.se>.
--
--- TODO ----------------------------------------------------------------------
--
--  * error correction is missing
--
--  * in (>||<) in the last case, `(addBoundsNum bn bn')' is too simple, as
--    the number of outgoing edges is not the sum of the numbers of the
--    individual states when there are conflicting edges, ie, ones labeled
--    with the same character; however, the number is only used to decide a
--    heuristic, so it is questionable whether it is worth spending the
--    additional effort of computing the accurate number
--
--  * Unicode posses a problem as the character domain becomes too big for
--    using arrays to represent transition tables and even sparse structures
--    will posse a significant overhead when character ranges are naively
--    represented.  So, it might be time for finite maps again.  
--
--    Regarding the character ranges, there seem to be at least two
--    possibilities.  Doaitse explicitly uses ranges and avoids expanding
--    them.  The problem with this approach is that we may only have
--    predicates such as `isAlphaNum' to determine whether a givne character
--    belongs to some character class.  From this representation it is
--    difficult to efficiently compute a range.  The second approach, as
--    proposed by Tom Pledger <Tom.Pledger@peace.com> (on the Haskell list)
--    would be to actually use predicates directly and make the whole business
--    efficient by caching predicate queries.  In other words, for any given
--    character after we have determined (in a given state) once what the
--    following state on accepting that character is, we need not consult the
--    predicates again if we memorise the successor state the first time
--    around.
--
--  * Ken Shan <ken@digitas.harvard.edu> writes ``Section 4.3 of your paper
--    computes the definition 
--
--      re1 `star` re2 = \l' -> let self = re1 self >||< re2 l' in self
--
--    If we let re2 = epsilon, we get
--
--      many :: Regexp s t -> Regexp s t
--      many re = \l' -> let self = re1 self >||< l' in self
--
--    since epsilon = id.''  This should actually be as good as the current
--    definiton and it might be worthwhile to offer it as a variant.
--

module Lexers (Regexp, Lexer, Action, epsilon, char, (+>), lexaction,
	       lexactionErr, lexmeta, (>|<), (>||<), ctrlChars, ctrlLexer,
	       star, plus, quest, alt, string, LexerState, execLexer)
where 

import Data.Maybe  (fromMaybe, isNothing)
import Data.Array  (Ix(..), Array, array, (!), assocs, accumArray)

import Position (Position(..), Pos (posOf), nopos, incPos, tabPos, retPos)
import DLists (DList, openDL, zeroDL, unitDL, snocDL, joinDL, closeDL)
import Errors (interr, ErrorLvl(..), Error, makeError)


infixr 4 `quest`, `star`, `plus`
infixl 3 +>, `lexaction`, `lexmeta`
infixl 2 >|<, >||<


-- constants
-- ---------

-- we use the dense representation if a table has at least the given number of 
-- (non-error) elements
--
denseMin :: Int
denseMin :: Int
denseMin  = Int
20


-- data structures
-- ---------------

-- represents the number of (non-error) elements and the bounds of a table
--
type BoundsNum = (Int, Char, Char)

-- empty bounds
--
nullBoundsNum :: BoundsNum
nullBoundsNum :: BoundsNum
nullBoundsNum  = (Int
0, forall a. Bounded a => a
maxBound, forall a. Bounded a => a
minBound)

-- combine two bounds
--
addBoundsNum                            :: BoundsNum -> BoundsNum -> BoundsNum
addBoundsNum :: BoundsNum -> BoundsNum -> BoundsNum
addBoundsNum (Int
n, Char
lc, Char
hc) (Int
n', Char
lc', Char
hc')  = (Int
n forall a. Num a => a -> a -> a
+ Int
n', forall a. Ord a => a -> a -> a
min Char
lc Char
lc', forall a. Ord a => a -> a -> a
max Char
hc Char
hc')

-- check whether a character is in the bounds
--
inBounds               :: Char -> BoundsNum -> Bool
inBounds :: Char -> BoundsNum -> Bool
inBounds Char
c (Int
_, Char
lc, Char
hc)  = Char
c forall a. Ord a => a -> a -> Bool
>= Char
lc Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
hc

-- Lexical actions take a lexeme with its position and may return a token; in
-- a variant, an error can be returned (EXPORTED)
--
--  * if there is no token returned, the current lexeme is discarded lexing
--   continues looking for a token
--
type Action    t = String -> Position -> Maybe t
type ActionErr t = String -> Position -> Either Error t

-- Meta actions transform the lexeme, position, and a user-defined state; they
-- may return a lexer, which is then used for accepting the next token (this
-- is important to implement non-regular behaviour like nested comments)
-- (EXPORTED) 
--
type Meta s t = String -> Position -> s -> (Maybe (Either Error t), -- err/tok?
					    Position,		    -- new pos
					    s,			    -- state
					    Maybe (Lexer s t))	    -- lexer?

-- tree structure used to represent the lexer table (EXPORTED ABSTRACTLY) 
--
--  * each node in the tree corresponds to a state of the lexer; the associated 
--   actions are those that apply when the corresponding state is reached
--
data Lexer s t = Lexer (LexAction s t) (Cont s t)

-- represent the continuation of a lexer
--
data Cont s t = -- on top of the tree, where entries are dense, we use arrays
		--
		Dense BoundsNum (Array Char (Lexer s t))
		--
		-- further down, where the valid entries are sparse, we
		-- use association lists, to save memory (the first argument
		-- is the length of the list)
		--
	      | Sparse BoundsNum [(Char, Lexer s t)]
		--
		-- end of a automaton
		--
	      | Done
--	      deriving Show

-- lexical action (EXPORTED ABSTRACTLY)
--
data LexAction s t = Action   (Meta s t)
		   | NoAction
--		   deriving Show

-- a regular expression (EXPORTED)
--
type Regexp s t = Lexer s t -> Lexer s t


-- basic combinators
-- -----------------

-- Empty lexeme (EXPORTED)
--
epsilon :: Regexp s t
epsilon :: forall s t. Regexp s t
epsilon  = forall a. a -> a
id

-- One character regexp (EXPORTED) 
--
char   :: Char -> Regexp s t
char :: forall s t. Char -> Regexp s t
char Char
c  = \Lexer s t
l -> forall s t. LexAction s t -> Cont s t -> Lexer s t
Lexer forall s t. LexAction s t
NoAction (forall s t. BoundsNum -> [(Char, Lexer s t)] -> Cont s t
Sparse (Int
1, Char
c, Char
c) [(Char
c, Lexer s t
l)])

-- Concatenation of regexps (EXPORTED)
--
(+>) :: Regexp s t -> Regexp s t -> Regexp s t
+> :: forall s t. Regexp s t -> Regexp s t -> Regexp s t
(+>)  = forall b c a. (b -> c) -> (a -> b) -> a -> c
(.)

-- Close a regular expression with an action that converts the lexeme into a
-- token (EXPORTED)
--
--  * Note: After the application of the action, the position is advanced
--	   according to the length of the lexeme.  This implies that normal
--	   actions should not be used in the case where a lexeme might contain 
--	   control characters that imply non-standard changes of the position, 
--	   such as newlines or tabs.
--
lexaction      :: Regexp s t -> Action t -> Lexer s t
lexaction :: forall s t. Regexp s t -> Action t -> Lexer s t
lexaction Regexp s t
re Action t
a  = Regexp s t
re forall s t. Regexp s t -> Meta s t -> Lexer s t
`lexmeta` forall {c} {a} {a}.
[Char]
-> Position -> c -> (Maybe (Either a t), Position, c, Maybe a)
a'
  where
    a' :: [Char]
-> Position -> c -> (Maybe (Either a t), Position, c, Maybe a)
a' [Char]
lexeme pos :: Position
pos@(Position [Char]
fname Int
row Int
col) c
s = 
       let col' :: Int
col' = Int
col forall a. Num a => a -> a -> a
+ forall (t :: * -> *) a. Foldable t => t a -> Int
length [Char]
lexeme
       in
       Int
col' seq :: forall a b. a -> b -> b
`seq` case Action t
a [Char]
lexeme Position
pos of
		    Maybe t
Nothing -> (forall a. Maybe a
Nothing, ([Char] -> Int -> Int -> Position
Position [Char]
fname Int
row Int
col'), c
s, forall a. Maybe a
Nothing)
		    Just t
t  -> (forall a. a -> Maybe a
Just (forall a b. b -> Either a b
Right t
t), ([Char] -> Int -> Int -> Position
Position [Char]
fname Int
row Int
col'), c
s, forall a. Maybe a
Nothing)

-- Variant for actions that may returns an error (EXPORTED)
--
lexactionErr      :: Regexp s t -> ActionErr t -> Lexer s t
lexactionErr :: forall s t. Regexp s t -> ActionErr t -> Lexer s t
lexactionErr Regexp s t
re ActionErr t
a  = Regexp s t
re forall s t. Regexp s t -> Meta s t -> Lexer s t
`lexmeta` forall {c} {a}.
[Char]
-> Position -> c -> (Maybe (Either Error t), Position, c, Maybe a)
a'
  where
     a' :: [Char]
-> Position -> c -> (Maybe (Either Error t), Position, c, Maybe a)
a' [Char]
lexeme pos :: Position
pos@(Position [Char]
fname Int
row Int
col) c
s = 
       let col' :: Int
col' = Int
col forall a. Num a => a -> a -> a
+ forall (t :: * -> *) a. Foldable t => t a -> Int
length [Char]
lexeme
       in
       Int
col' seq :: forall a b. a -> b -> b
`seq` (forall a. a -> Maybe a
Just (ActionErr t
a [Char]
lexeme Position
pos), ([Char] -> Int -> Int -> Position
Position [Char]
fname Int
row Int
col'), c
s, forall a. Maybe a
Nothing)

-- Close a regular expression with a meta action (EXPORTED)
--
--  * Note: Meta actions have to advance the position in dependence of the
--	   lexeme by themselves.
--
lexmeta      :: Regexp s t -> Meta s t -> Lexer s t
lexmeta :: forall s t. Regexp s t -> Meta s t -> Lexer s t
lexmeta Regexp s t
re Meta s t
a  = Regexp s t
re (forall s t. LexAction s t -> Cont s t -> Lexer s t
Lexer (forall s t. Meta s t -> LexAction s t
Action Meta s t
a) forall s t. Cont s t
Done)

-- disjunctive combination of two regexps (EXPORTED)
--
(>|<)      :: Regexp s t -> Regexp s t -> Regexp s t
Regexp s t
re >|< :: forall s t. Regexp s t -> Regexp s t -> Regexp s t
>|< Regexp s t
re'  = \Lexer s t
l -> Regexp s t
re Lexer s t
l forall s t. Lexer s t -> Lexer s t -> Lexer s t
>||< Regexp s t
re' Lexer s t
l

-- disjunctive combination of two lexers (EXPORTED)
--
(>||<)                         :: Lexer s t -> Lexer s t -> Lexer s t
(Lexer LexAction s t
a Cont s t
c) >||< :: forall s t. Lexer s t -> Lexer s t -> Lexer s t
>||< (Lexer LexAction s t
a' Cont s t
c')  = forall s t. LexAction s t -> Cont s t -> Lexer s t
Lexer (forall s t. LexAction s t -> LexAction s t -> LexAction s t
joinActions LexAction s t
a LexAction s t
a') (forall s t. Cont s t -> Cont s t -> Cont s t
joinConts Cont s t
c Cont s t
c')

-- combine two disjunctive continuations
--
joinConts :: Cont s t -> Cont s t -> Cont s t
joinConts :: forall s t. Cont s t -> Cont s t -> Cont s t
joinConts Cont s t
Done Cont s t
c'   = Cont s t
c'
joinConts Cont s t
c    Cont s t
Done = Cont s t
c
joinConts Cont s t
c    Cont s t
c'   = let (BoundsNum
bn , [(Char, Lexer s t)]
cls ) = forall {s} {t}. Cont s t -> (BoundsNum, [(Char, Lexer s t)])
listify Cont s t
c
			  (BoundsNum
bn', [(Char, Lexer s t)]
cls') = forall {s} {t}. Cont s t -> (BoundsNum, [(Char, Lexer s t)])
listify Cont s t
c'
		      in
		      -- note: `addsBoundsNum' can, at this point, only
		      --       approx. the number of *non-overlapping* cases;
		      --       however, the bounds are correct 
		      --
                      forall s t. BoundsNum -> [(Char, Lexer s t)] -> Cont s t
aggregate (BoundsNum -> BoundsNum -> BoundsNum
addBoundsNum BoundsNum
bn BoundsNum
bn') ([(Char, Lexer s t)]
cls forall a. [a] -> [a] -> [a]
++ [(Char, Lexer s t)]
cls')
  where
    listify :: Cont s t -> (BoundsNum, [(Char, Lexer s t)])
listify (Dense  BoundsNum
n Array Char (Lexer s t)
arr) = (BoundsNum
n, forall i e. Ix i => Array i e -> [(i, e)]
assocs Array Char (Lexer s t)
arr)
    listify (Sparse BoundsNum
n [(Char, Lexer s t)]
cls) = (BoundsNum
n, [(Char, Lexer s t)]
cls)
    listify Cont s t
_		   = forall a. [Char] -> a
interr [Char]
"Lexers.listify: Impossible argument!"

-- combine two actions
--
joinActions :: LexAction s t -> LexAction s t -> LexAction s t
joinActions :: forall s t. LexAction s t -> LexAction s t -> LexAction s t
joinActions LexAction s t
NoAction LexAction s t
a'       = LexAction s t
a'
joinActions LexAction s t
a	     LexAction s t
NoAction = LexAction s t
a
joinActions LexAction s t
_        LexAction s t
_        = forall a. [Char] -> a
interr [Char]
"Lexers.>||<: Overlapping actions!"

-- Note: `n' is only an upper bound of the number of non-overlapping cases
--
aggregate :: BoundsNum -> ([(Char, Lexer s t)]) -> Cont s t
aggregate :: forall s t. BoundsNum -> [(Char, Lexer s t)] -> Cont s t
aggregate bn :: BoundsNum
bn@(Int
n, Char
lc, Char
hc) [(Char, Lexer s t)]
cls
  | Int
n forall a. Ord a => a -> a -> Bool
>= Int
denseMin = forall s t. BoundsNum -> Array Char (Lexer s t) -> Cont s t
Dense  BoundsNum
bn (forall i e a.
Ix i =>
(e -> a -> e) -> e -> (i, i) -> [(i, a)] -> Array i e
accumArray forall s t. Lexer s t -> Lexer s t -> Lexer s t
(>||<) forall {s} {t}. Lexer s t
noLexer (Char
lc, Char
hc) [(Char, Lexer s t)]
cls)
  | Bool
otherwise     = forall s t. BoundsNum -> [(Char, Lexer s t)] -> Cont s t
Sparse BoundsNum
bn (forall a b. Eq a => (b -> b -> b) -> [(a, b)] -> [(a, b)]
accum forall s t. Lexer s t -> Lexer s t -> Lexer s t
(>||<) [(Char, Lexer s t)]
cls)
  where
    noLexer :: Lexer s t
noLexer = forall s t. LexAction s t -> Cont s t -> Lexer s t
Lexer forall s t. LexAction s t
NoAction forall s t. Cont s t
Done

-- combine the elements in the association list that have the same key
--
accum :: Eq a => (b -> b -> b) -> [(a, b)] -> [(a, b)]
accum :: forall a b. Eq a => (b -> b -> b) -> [(a, b)] -> [(a, b)]
accum b -> b -> b
f []           = []
accum b -> b -> b
f ((a
k, b
e):[(a, b)]
kes) = 
  let ((a, b)
ke, [(a, b)]
kes') = forall {t}. Eq t => t -> b -> [(t, b)] -> ((t, b), [(t, b)])
gather a
k b
e [(a, b)]
kes
  in
  (a, b)
ke forall a. a -> [a] -> [a]
: forall a b. Eq a => (b -> b -> b) -> [(a, b)] -> [(a, b)]
accum b -> b -> b
f [(a, b)]
kes'
  where
    gather :: t -> b -> [(t, b)] -> ((t, b), [(t, b)])
gather t
k b
e []                             = ((t
k, b
e), [])
    gather t
k b
e (ke' :: (t, b)
ke'@(t
k', b
e'):[(t, b)]
kes) | t
k forall a. Eq a => a -> a -> Bool
== t
k'   = t -> b -> [(t, b)] -> ((t, b), [(t, b)])
gather t
k (b -> b -> b
f b
e b
e') [(t, b)]
kes
				  | Bool
otherwise = let 
						  ((t, b)
ke'', [(t, b)]
kes') = t -> b -> [(t, b)] -> ((t, b), [(t, b)])
gather t
k b
e [(t, b)]
kes
						in
						((t, b)
ke'', (t, b)
ke'forall a. a -> [a] -> [a]
:[(t, b)]
kes')


-- handling of control characters
-- ------------------------------

-- control characters recognized by `ctrlLexer' (EXPORTED)
--
ctrlChars :: [Char]
ctrlChars :: [Char]
ctrlChars  = [Char
'\n', Char
'\r', Char
'\f', Char
'\t']

-- control lexer (EXPORTED)
--
--  * implements proper `Position' management in the presence of the standard
--   layout control characters
--
ctrlLexer :: Lexer s t
ctrlLexer :: forall {s} {t}. Lexer s t
ctrlLexer  =     
       forall s t. Char -> Regexp s t
char Char
'\n' forall s t. Regexp s t -> Meta s t -> Lexer s t
`lexmeta` forall {p} {c} {a} {a}.
p -> Position -> c -> (Maybe a, Position, c, Maybe a)
newline
  forall s t. Lexer s t -> Lexer s t -> Lexer s t
>||< forall s t. Char -> Regexp s t
char Char
'\r' forall s t. Regexp s t -> Meta s t -> Lexer s t
`lexmeta` forall {p} {c} {a} {a}.
p -> Position -> c -> (Maybe a, Position, c, Maybe a)
newline
  forall s t. Lexer s t -> Lexer s t -> Lexer s t
>||< forall s t. Char -> Regexp s t
char Char
'\v' forall s t. Regexp s t -> Meta s t -> Lexer s t
`lexmeta` forall {p} {c} {a} {a}.
p -> Position -> c -> (Maybe a, Position, c, Maybe a)
newline
  forall s t. Lexer s t -> Lexer s t -> Lexer s t
>||< forall s t. Char -> Regexp s t
char Char
'\f' forall s t. Regexp s t -> Meta s t -> Lexer s t
`lexmeta` forall {p} {c} {a} {a}.
p -> Position -> c -> (Maybe a, Position, c, Maybe a)
formfeed
  forall s t. Lexer s t -> Lexer s t -> Lexer s t
>||< forall s t. Char -> Regexp s t
char Char
'\t' forall s t. Regexp s t -> Meta s t -> Lexer s t
`lexmeta` forall {p} {c} {a} {a}.
p -> Position -> c -> (Maybe a, Position, c, Maybe a)
tab
  where
    newline :: p -> Position -> c -> (Maybe a, Position, c, Maybe a)
newline  p
_ Position
pos c
s = (forall a. Maybe a
Nothing, Position -> Position
retPos Position
pos  , c
s, forall a. Maybe a
Nothing)
    formfeed :: p -> Position -> c -> (Maybe a, Position, c, Maybe a)
formfeed p
_ Position
pos c
s = (forall a. Maybe a
Nothing, Position -> Int -> Position
incPos Position
pos Int
1, c
s, forall a. Maybe a
Nothing)
    tab :: p -> Position -> c -> (Maybe a, Position, c, Maybe a)
tab      p
_ Position
pos c
s = (forall a. Maybe a
Nothing, Position -> Position
tabPos Position
pos  , c
s, forall a. Maybe a
Nothing) 


-- non-basic combinators
-- ---------------------

-- x `star` y corresponds to the regular expression x*y (EXPORTED)
--
star :: Regexp s t -> Regexp s t -> Regexp s t
--
-- The definition used below can be obtained by equational reasoning from this
-- one (which is much easier to understand): 
--
--   star re1 re2 = let self = (re1 +> self >|< epsilon) in self +> re2
--
-- However, in the above, `self' is of type `Regexp s t' (ie, a functional),
-- whereas below it is of type `Lexer s t'.  Thus, below we have a graphical
-- body (finite representation of an infinite structure), which doesn't grow
-- with the size of the accepted lexeme - in contrast to the definition using
-- the functional recursion.
--
star :: forall s t. Regexp s t -> Regexp s t -> Regexp s t
star Regexp s t
re1 Regexp s t
re2  = \Lexer s t
l -> let self :: Lexer s t
self = Regexp s t
re1 Lexer s t
self forall s t. Lexer s t -> Lexer s t -> Lexer s t
>||< Regexp s t
re2 Lexer s t
l
		      in 
		      Lexer s t
self

-- x `plus` y corresponds to the regular expression x+y (EXPORTED)
--
plus         :: Regexp s t -> Regexp s t -> Regexp s t
plus :: forall s t. Regexp s t -> Regexp s t -> Regexp s t
plus Regexp s t
re1 Regexp s t
re2  = Regexp s t
re1 forall s t. Regexp s t -> Regexp s t -> Regexp s t
+> (Regexp s t
re1 forall s t. Regexp s t -> Regexp s t -> Regexp s t
`star` Regexp s t
re2)

-- x `quest` y corresponds to the regular expression x?y (EXPORTED)
--
quest         :: Regexp s t -> Regexp s t -> Regexp s t
quest :: forall s t. Regexp s t -> Regexp s t -> Regexp s t
quest Regexp s t
re1 Regexp s t
re2  = (Regexp s t
re1 forall s t. Regexp s t -> Regexp s t -> Regexp s t
+> Regexp s t
re2) forall s t. Regexp s t -> Regexp s t -> Regexp s t
>|< Regexp s t
re2

-- accepts a non-empty set of alternative characters (EXPORTED)
--
alt    :: [Char] -> Regexp s t
--
--  Equiv. to `(foldr1 (>|<) . map char) cs', but much faster
--
alt :: forall s t. [Char] -> Regexp s t
alt []  = forall a. [Char] -> a
interr [Char]
"Lexers.alt: Empty character set!"
alt [Char]
cs  = \Lexer s t
l -> let bnds :: BoundsNum
bnds = (forall (t :: * -> *) a. Foldable t => t a -> Int
length [Char]
cs, forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
minimum [Char]
cs, forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
maximum [Char]
cs)
		in
		forall s t. LexAction s t -> Cont s t -> Lexer s t
Lexer forall s t. LexAction s t
NoAction (forall s t. BoundsNum -> [(Char, Lexer s t)] -> Cont s t
aggregate BoundsNum
bnds [(Char
c, Lexer s t
l) | Char
c <- [Char]
cs])

-- accept a character sequence (EXPORTED)
--
string    :: String -> Regexp s t
string :: forall s t. [Char] -> Regexp s t
string []  = forall a. [Char] -> a
interr [Char]
"Lexers.string: Empty character set!"
string [Char]
cs  = (forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldr1 forall s t. Regexp s t -> Regexp s t -> Regexp s t
(+>) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map forall s t. Char -> Regexp s t
char) [Char]
cs


-- execution of a lexer
-- --------------------

-- threaded top-down during lexing (current input, current position, meta
-- state) (EXPORTED)
--
type LexerState s = (String, Position, s)

-- apply a lexer, yielding a token sequence and a list of errors (EXPORTED)
--
--  * Currently, all errors are fatal; thus, the result is undefined in case of 
--   an error (this changes when error correction is added).
--
--  * The final lexer state is returned.
--
--  * The order of the error messages is undefined.
--
execLexer :: Lexer s t -> LexerState s -> ([t], LexerState s, [Error])
--
--  * the following is moderately tuned
--
execLexer :: forall s t.
Lexer s t -> LexerState s -> ([t], LexerState s, [Error])
execLexer Lexer s t
l state :: LexerState s
state@([], Position
_, s
_) = ([], LexerState s
state, [])
execLexer Lexer s t
l LexerState s
state            = 
  case Lexer s t
-> LexerState s
-> (Maybe (Either Error t), Lexer s t, LexerState s)
lexOne Lexer s t
l LexerState s
state of
    (Maybe (Either Error t)
Nothing , Lexer s t
_ , LexerState s
state') -> forall s t.
Lexer s t -> LexerState s -> ([t], LexerState s, [Error])
execLexer Lexer s t
l LexerState s
state'
    (Just Either Error t
res, Lexer s t
l', LexerState s
state') -> let ([t]
ts, LexerState s
final, [Error]
allErrs) = forall s t.
Lexer s t -> LexerState s -> ([t], LexerState s, [Error])
execLexer Lexer s t
l' LexerState s
state'
			      in case Either Error t
res of
			        (Left  Error
err) -> ([t]
ts  , LexerState s
final, Error
errforall a. a -> [a] -> [a]
:[Error]
allErrs)
				(Right t
t  ) -> (t
tforall a. a -> [a] -> [a]
:[t]
ts, LexerState s
final, [Error]
allErrs)
  where
    -- accept a single lexeme
    --
    -- lexOne :: Lexer s t -> LexerState s t
    --	      -> (Either Error (Maybe t), Lexer s t, LexerState s t)
    lexOne :: Lexer s t
-> LexerState s
-> (Maybe (Either Error t), Lexer s t, LexerState s)
lexOne Lexer s t
l0 LexerState s
state = Lexer s t
-> LexerState s
-> DList Char
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
oneLexeme Lexer s t
l0 LexerState s
state forall a. DList a
zeroDL forall {b}. (Maybe (Either Error b), Lexer s t, LexerState s)
lexErr
      where
        -- the result triple of `lexOne' that signals a lexical error;
        -- the result state is advanced by one character for error correction
        --
	lexErr :: (Maybe (Either Error b), Lexer s t, LexerState s)
lexErr = let ([Char]
cs, pos :: Position
pos@(Position [Char]
fname Int
row Int
col), s
s) = LexerState s
state
	             err :: Error
err = ErrorLvl -> Position -> [[Char]] -> Error
makeError ErrorLvl
ErrorErr Position
pos
			     [[Char]
"Lexical error!", 
			      [Char]
"The character " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (forall a. [a] -> a
head [Char]
cs) 
			      forall a. [a] -> [a] -> [a]
++ [Char]
" does not fit here; skipping it."]
		 in
		 (forall a. a -> Maybe a
Just (forall a b. a -> Either a b
Left Error
err), Lexer s t
l, (forall a. DList a
tail [Char]
cs, ([Char] -> Int -> Int -> Position
Position [Char]
fname Int
row (Int
col forall a. Num a => a -> a -> a
+ Int
1)), s
s))

	-- we take an open list of characters down, where we accumulate the
	-- lexeme; this function returns maybe a token, the next lexer to use
	-- (can be altered by a meta action), the new lexer state, and a list
	-- of errors
	--
	-- we implement the "principle of the longest match" by taking a
	-- potential result quadruple down (in the last argument); the
	-- potential result quadruple is updated whenever we pass by an action 
	-- (different from `NoAction'); initially it is an error result
	--
	-- oneLexeme :: Lexer s t
	--	     -> LexerState
	--	     -> DList Char 
	--	     -> (Maybe (Either Error t), Maybe (Lexer s t), 
	--		 LexerState s t)
	--	     -> (Maybe (Either Error t), Maybe (Lexer s t), 
	--		 LexerState s t)
	oneLexeme :: Lexer s t
-> LexerState s
-> DList Char
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
oneLexeme (Lexer LexAction s t
a Cont s t
cont) state :: LexerState s
state@([Char]
cs, Position
pos, s
s) DList Char
csDL (Maybe (Either Error t), Lexer s t, LexerState s)
last =
	  let last' :: (Maybe (Either Error t), Lexer s t, LexerState s)
last' = LexAction s t
-> DList Char
-> LexerState s
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
action LexAction s t
a DList Char
csDL LexerState s
state (Maybe (Either Error t), Lexer s t, LexerState s)
last
	  in case [Char]
cs of
	    []      -> (Maybe (Either Error t), Lexer s t, LexerState s)
last'
	    (Char
c:[Char]
cs') -> Cont s t
-> Char
-> LexerState s
-> DList Char
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
oneChar Cont s t
cont Char
c ([Char]
cs', Position
pos, s
s) DList Char
csDL (Maybe (Either Error t), Lexer s t, LexerState s)
last'

        oneChar :: Cont s t
-> Char
-> LexerState s
-> DList Char
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
oneChar Cont s t
Done            Char
c LexerState s
state DList Char
csDL (Maybe (Either Error t), Lexer s t, LexerState s)
last = (Maybe (Either Error t), Lexer s t, LexerState s)
last
        oneChar (Dense  BoundsNum
bn Array Char (Lexer s t)
arr) Char
c LexerState s
state DList Char
csDL (Maybe (Either Error t), Lexer s t, LexerState s)
last
	  | Char
c Char -> BoundsNum -> Bool
`inBounds` BoundsNum
bn = Lexer s t
-> Char
-> LexerState s
-> DList Char
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
cont (Array Char (Lexer s t)
arrforall i e. Ix i => Array i e -> i -> e
!Char
c) Char
c LexerState s
state DList Char
csDL (Maybe (Either Error t), Lexer s t, LexerState s)
last
	  | Bool
otherwise       = (Maybe (Either Error t), Lexer s t, LexerState s)
last
	oneChar (Sparse BoundsNum
bn [(Char, Lexer s t)]
cls) Char
c LexerState s
state DList Char
csDL (Maybe (Either Error t), Lexer s t, LexerState s)
last
	  | Char
c Char -> BoundsNum -> Bool
`inBounds` BoundsNum
bn = case forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Char
c [(Char, Lexer s t)]
cls of
			        Maybe (Lexer s t)
Nothing -> (Maybe (Either Error t), Lexer s t, LexerState s)
last
				Just Lexer s t
l' -> Lexer s t
-> Char
-> LexerState s
-> DList Char
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
cont Lexer s t
l' Char
c LexerState s
state DList Char
csDL (Maybe (Either Error t), Lexer s t, LexerState s)
last
          | Bool
otherwise       = (Maybe (Either Error t), Lexer s t, LexerState s)
last

	-- continue within the current lexeme
	--
	cont :: Lexer s t
-> Char
-> LexerState s
-> DList Char
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
cont Lexer s t
l' Char
c LexerState s
state DList Char
csDL (Maybe (Either Error t), Lexer s t, LexerState s)
last = Lexer s t
-> LexerState s
-> DList Char
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
oneLexeme Lexer s t
l' LexerState s
state (DList Char
csDL forall a. DList a -> a -> DList a
`snocDL` Char
c) (Maybe (Either Error t), Lexer s t, LexerState s)
last

	-- execute the action if present and finalise the current lexeme
	--
	action :: LexAction s t
-> DList Char
-> LexerState s
-> (Maybe (Either Error t), Lexer s t, LexerState s)
-> (Maybe (Either Error t), Lexer s t, LexerState s)
action (Action Meta s t
f) DList Char
csDL ([Char]
cs, Position
pos, s
s) (Maybe (Either Error t), Lexer s t, LexerState s)
last = 
	  case Meta s t
f (forall a. DList a -> [a]
closeDL DList Char
csDL) Position
pos s
s of
	    (Maybe (Either Error t)
Nothing, Position
pos', s
s', Maybe (Lexer s t)
l') 
	      | Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a. Foldable t => t a -> Bool
null forall a b. (a -> b) -> a -> b
$ [Char]
cs     -> Lexer s t
-> LexerState s
-> (Maybe (Either Error t), Lexer s t, LexerState s)
lexOne (forall a. a -> Maybe a -> a
fromMaybe Lexer s t
l0 Maybe (Lexer s t)
l') ([Char]
cs, Position
pos', s
s')
	    (Maybe (Either Error t)
res    , Position
pos', s
s', Maybe (Lexer s t)
l') -> (Maybe (Either Error t)
res, (forall a. a -> Maybe a -> a
fromMaybe Lexer s t
l0 Maybe (Lexer s t)
l'), ([Char]
cs, Position
pos', s
s'))
	action LexAction s t
NoAction DList Char
csDL LexerState s
state (Maybe (Either Error t), Lexer s t, LexerState s)
last =
	  (Maybe (Either Error t), Lexer s t, LexerState s)
last						-- no change