{-# LANGUAGE ScopedTypeVariables, DeriveLift #-} {-| Module : Text.ANTLR.Lex.Regex Description : Regular expressions as used during tokenization Copyright : (c) Karl Cronburg, 2018 License : BSD3 Maintainer : karl@cs.tufts.edu Stability : experimental Portability : POSIX -} module Text.ANTLR.Lex.Regex where import Text.ANTLR.Set (Hashable, singleton, fromList) import Text.ANTLR.Lex.NFA import qualified Text.ANTLR.Lex.DFA as DFA import Language.Haskell.TH.Syntax (Lift(..)) -- | Regular expression data representation as used by the tokenizer. data Regex s = Epsilon -- ^ Regex accepting the empty string | Symbol s -- ^ An individual symbol in the alphabet | Literal [s] -- ^ A literal sequence of symbols (concatenated together) | Class [s] -- ^ A set of alternative symbols (unioned together) | Union (Regex s) (Regex s) -- ^ Union of two arbitrary regular expressions | Concat [Regex s] -- ^ Concatenation of 2 or more regular expressions | Kleene (Regex s) -- ^ Kleene closure of a regex | PosClos (Regex s) -- ^ Positive closure | Question (Regex s) -- ^ 0 or 1 instances | MultiUnion [Regex s] -- ^ Union of two or more arbitrary regexs | NotClass [s] -- ^ Complement of a character class deriving (Lift) instance (Show s) => Show (Regex s) where show Epsilon = "ϵ" show (Symbol s) = show s show (Literal s) = show s show (Class s) = "[" ++ show s ++ "]" show (Union r1 r2) = "(" ++ show r1 ++ "|" ++ show r2 ++ ")" show (Concat rs) = concatMap show rs show (Kleene r) = "(" ++ show r ++ ")*" show (PosClos r) = "(" ++ show r ++ ")+" show (Question r) = "(" ++ show r ++ ")?" show (MultiUnion rs) = tail $ concatMap (\r -> "|" ++ show r) rs show (NotClass rs) = "[^" ++ tail (concatMap show rs) ++ "]" -- | Translation code of a regular expresion to an NFA. regex2nfa' :: forall s i. (Hashable i, Ord i, Hashable s, Eq s) => (i -> Int) -> (Int -> i) -> Regex s -> NFA s i regex2nfa' from to r = let r2n :: Regex s -> NFA s i r2n Epsilon = list2nfa [ (to 0, (False, singleton NFAEpsilon), to 1) ] r2n (Symbol s) = list2nfa [ (to 0, (False, singleton $ Edge s), to 1) ] r2n (Union r1 r2) = nfaUnion from to (r2n r1) (r2n r2) r2n (Concat []) = r2n Epsilon -- TODO: empty concat r2n (Concat (r:rs)) = foldl (nfaConcat from to) (r2n r) (map r2n rs) r2n (Kleene r1) = nfaKleene from to (r2n r1) r2n (PosClos r1) = r2n $ Concat [r1, Kleene r1] r2n (Question r1) = nfaUnion from to (r2n r1) (r2n Epsilon) r2n (Class []) = r2n Epsilon -- TODO: empty character class shouldn't accept empty string? r2n (Class (s:ss)) = list2nfa [ (to 0, (False, fromList $ map Edge $ s:ss), to 1) ] --r2n $ foldl Union (Symbol s) (map Symbol ss) r2n (MultiUnion []) = r2n Epsilon r2n (MultiUnion (r:rs)) = r2n $ foldl Union r rs r2n (Literal ss) = list2nfa $ map (\(s,i) -> (to i, (False, singleton $ Edge s), to $ i + 1)) (zip ss [0..length ss - 1]) r2n (NotClass []) = list2nfa $ [ (to 0, (True, fromList []), to 1) ] -- Not nothing = everything r2n (NotClass (s:ss)) = list2nfa $ [ (to 0, (True, fromList $ map Edge $ s:ss), to 1) ] in r2n r -- | Entrypoint for translating a regular expression into an 'NFA' with integer indices. regex2nfa :: (Hashable s, Ord s) => Regex s -> NFA s Int regex2nfa = regex2nfa' id id -- | Entrypoint for translating a regular expression into a 'DFA.DFA' with integer indices. regex2dfa :: (Hashable s, Ord s) => Regex s -> DFA.DFA s (DFAState Int) regex2dfa = nfa2dfa . regex2nfa