-- BNFC 3 -- (C) 2021 Andreas Abel {-# LANGUAGE LambdaCase #-} {-# LANGUAGE PatternGuards #-} -- | Tools to manipulate regular expressions. module BNFC.Types.Regex where import BNFC.Prelude import qualified Data.List as List import qualified BNFC.Utils.List2 as List2 import qualified Data.Set as Set -- * Regular expressions --------------------------------------------------------------------------- -- | Regular expressions are constructed over character classes. -- -- Use smart constructors to ensure invariants. data Regex = RChar CharClass -- ^ Atomic regular expression. | RAlts (List2 Regex) -- ^ Alternative/sum: List free of duplicates and @RAlt@. -- We use list instead of set to preserve the order given by the user. -- Empty list would mean empty language, -- but this is instead represented by the empty character class. | RMinus Regex Regex -- ^ Difference. -- Most lexer generators do not support difference in general, -- only at the level of character classes. -- LBNF has general difference, so it is represented here. | REps -- ^ Language of the empty word (empty sequence). | RSeqs (List2 Regex) -- ^ Sequence/product. List free of @RSeq@. -- Empty list is @eps@ (language of the empty word). | RStar Regex -- ^ 0 or more repetitions. -- 'Regex' isn't 'RStar', 'RPlus', 'ROpt', @'RAlts' []@ nor 'REps'. | RPlus Regex -- ^ 1 or more repetitions. -- 'Regex' isn't 'RStar', 'RPlus', 'ROpt', @'RAlts' []@ nor 'REps'. | ROpt Regex -- ^ 0 or 1 repetitions. -- 'Regex' isn't 'RStar', 'RPlus', 'ROpt', @'RAlts' []@ nor 'REps'. deriving (Eq, Ord, Show) pattern REmpty :: Regex pattern REmpty = RChar CEmpty pattern RAlt :: Regex -> Regex -> Regex pattern RAlt r1 r2 = RAlts (List2 r1 r2 []) pattern RSeq :: Regex -> Regex -> Regex pattern RSeq r1 r2 = RSeqs (List2 r1 r2 []) -- | Check if a regular expression is nullable (accepts the empty string). nullable :: Regex -> Bool nullable = \case RChar _ -> False RMinus r1 r2 -> nullable r1 && not (nullable r2) RAlts rs -> any nullable rs RSeqs rs -> all nullable rs REps -> True RStar _ -> True RPlus r -> nullable r ROpt _ -> True -- | Check if a regular expression matches at least one word. -- -- For differences, this check may err on the positive side. class Satisfiable a where satisfiable :: a -> Bool instance Satisfiable Regex where satisfiable = \case RChar c -> satisfiable c RMinus r _ -> satisfiable r -- approximatively! RAlts rs -> any satisfiable rs RSeqs rs -> all satisfiable rs REps -> True RStar _ -> True RPlus r -> satisfiable r ROpt _ -> True instance Satisfiable CharClass where satisfiable (CMinus c _) = satisfiable c -- approximatively! instance Satisfiable CharClassUnion where satisfiable = \case CAny -> True CAlt cs -> not $ null cs -- * Character classes --------------------------------------------------------------------------- -- | Character classes are regular expressions that recognize -- character sequences of length exactly one. These are often -- distinguished from arbitrary regular expressions in lexer -- generators, e.g. in @alex@. -- -- We represent character classes as a difference of unions of atomic -- character classes. -- -- Semantics: @⟦ CMinus ccYes ccNo ⟧ = ⟦ ccYes ⟧ \ ⟦ ccNo ⟧@ data CharClass = CMinus { ccYes :: CharClassUnion -- ^ Character in question must be in one of these character classes. , ccNo :: CharClassUnion -- ^ Character in question must not be in one of these character classes. -- Must be empty if 'ccYes' is empty. } deriving (Eq, Ord, Show) pattern CEmpty :: CharClass pattern CEmpty = CC CCEmpty pattern CC :: CharClassUnion -> CharClass pattern CC c = c `CMinus` CCEmpty -- | Possibly overlapping union of character classes. data CharClassUnion = CAny -- ^ Any character, LBNF @char@. | CAlt [CharClassAtom] -- ^ Any of the given (≥0) alternatives. List is free of duplicates. deriving (Ord, Show) instance Eq CharClassUnion where CAny == CAny = True CAlt cc1 == CAlt cc2 = Set.fromList cc1 == Set.fromList cc2 CAny == CAlt{} = False CAlt{} == CAny = False pattern CCEmpty :: CharClassUnion pattern CCEmpty = CAlt [] -- | Atomic character class. data CharClassAtom = CChar Char -- ^ A single character. | CDigit -- ^ @0-9@, LBNF @digit@. | CLower -- ^ Lower case character, LBNF @lower@. | CUpper -- ^ Upper case character, LBNF @upper@. deriving (Eq, Ord, Show) -- | Union of character class unions. instance Semigroup CharClassUnion where CAny <> _ = CAny _ <> CAny = CAny CAlt cs <> CAlt cs' = CAlt (cs <> cs') instance Monoid CharClassUnion where mempty = CAlt [] mappend = (<>) -- * Smart constructor for regular expressions. --------------------------------------------------------------------------- rChar :: Char -> Regex rChar = RChar . cChar -- | Simplifications included, but no distributivity. rSeq :: Regex -> Regex -> Regex rSeq = curry $ \case -- 0r = 0 (REmpty , _ ) -> REmpty (_ , REmpty ) -> REmpty -- 1r = r (REps , r ) -> r (r , REps ) -> r -- r*r* = r* (RStar r1, RStar r2) | r1 == r2 -> RStar r1 -- r?r* = r*r? = r* (ROpt r1, RStar r2) | r1 == r2 -> RStar r2 (RStar r1, ROpt r2) | r1 == r2 -> RStar r1 -- r+r* = r*r+ = r+ (RPlus r1, RStar r2) | r1 == r2 -> RPlus r1 (RStar r1, RPlus r2) | r1 == r2 -> RPlus r2 -- r+r? = r?r+ = r+ (RPlus r1, ROpt r2) | r1 == r2 -> RPlus r1 (ROpt r1, RPlus r2) | r1 == r2 -> RPlus r2 -- rr* = r*r = r+ (r1 , RStar r2) | r1 == r2 -> RPlus r2 (RStar r1, r2 ) | r1 == r2 -> RPlus r1 -- Associate (RSeqs r1, RSeqs r2) -> RSeqs $ r1 <> r2 (r , RSeqs rs) -> RSeqs $ List2.cons r rs (RSeqs rs, r ) -> RSeqs $ List2.snoc rs r -- general sequences (r1 , r2 ) -> r1 `RSeq` r2 rSeqs :: [Regex] -> Regex rSeqs = foldr rSeq REps rAlt :: Regex -> Regex -> Regex rAlt = curry $ \case -- 0 + r = r (REmpty , r ) -> r (r , REmpty) -> r -- join character alternatives (RChar c1, RChar c2) -> cAlt c1 c2 -- Associate to the left (RAlts r1, RAlts r2) -> RAlts $ List2.fromList $ nubOrd $ List2.toList r1 <> List2.toList r2 (r , RAlts rs) | r `elem` rs -> RAlts rs | otherwise -> RAlts $ List2.cons r rs (RAlts rs, r ) | r `elem` rs -> RAlts rs | otherwise -> RAlts $ List2.snoc rs r -- general alternatives (r1, r2) | r1 == r2 -> r1 -- idempotency, but not the general case | otherwise -> r1 `RAlt` r2 rAlts :: [Regex] -> Regex rAlts = foldr rAlt REmpty rMinus :: Regex -> Regex -> Regex rMinus = curry $ \case -- 0 - r = 0 (REmpty , _ ) -> REmpty -- r - 0 = r (r , REmpty) -> r -- join character alternatives (RChar c1, RChar c2) -> cMinus c1 c2 -- remove subtracted elements (RAlts rs, RAlts ss) -> case List2.toList rs List.\\ List2.toList ss of [] -> REmpty rs' -> rAlts rs' `RMinus` RAlts ss (r , RAlts rs) | r `elem` rs -> REmpty | otherwise -> r `RMinus` RAlts rs -- r - r = 0 (r1, r2) | r1 == r2 -> REmpty | otherwise -> r1 `RMinus` r2 rStar :: Regex -> Regex rStar = \case REmpty -> REps REps -> REps ROpt r -> RStar r RStar r -> RStar r RPlus r -> RStar r r -> RStar r rPlus :: Regex -> Regex rPlus = \case REmpty -> REmpty REps -> REps ROpt r -> RStar r RStar r -> RStar r RPlus r -> RPlus r r -> RPlus r rOpt :: Regex -> Regex rOpt = \case REmpty -> REps REps -> REps RStar r -> RStar r RPlus r -> RStar r ROpt r -> ROpt r r -> ROpt r -- | Disjunction of two character classes is either a character class again ('RChar') -- or simply the disjunction ('RAlt'). -- -- @(p1 \ m1) ∪ (p2 \ m2) = (p1 ∪ p2) \ (m1 ∪ m2)@ if @p1 ⊥ m2@ and @p2 ⊥ m1@ cAlt :: CharClass -> CharClass -> Regex cAlt c1@(CMinus p1 m1) c2@(CMinus p2 m2) | c1 == cAny || c2 == cAny = RChar cAny | p1 `ccuMinus` m2 == Right p1, p2 `ccuMinus` m1 == Right p2 = RChar $ either id CC $ (p1 <> p2) `ccuMinus` (m1 <> m2) | otherwise = RChar c1 `RAlt` RChar c2 -- | Disjunction of two character classes is either a character class again ('RChar') -- or simply the disjunction ('RMinus'). -- -- @(p1 \ m1) \ (0 \ m2) = p1 \ m1@ -- @(p1 \ m1) \ (p2 \ m2) = p1 \ (m1 ∪ p2)@ if @p1 \ m2 = p1@ cMinus :: CharClass -> CharClass -> Regex cMinus c1@(CMinus p1 m1) c2@(CMinus p2 m2) | p2 == mempty = RChar c1 | p1 `ccuMinus` m2 == Right p1 = RChar $ either id CC $ p1 `ccuMinus` (m1 <> p2) | otherwise = RChar c1 `RMinus` RChar c2 -- * Smart constructors for character classes. --------------------------------------------------------------------------- -- | Match given characters. cChar :: Char -> CharClass cChar c = CC $ CAlt [CChar c] -- | Match any of the given characters. cAlts :: [Char] -> CharClass cAlts cs = CC $ CAlt $ nubOrd $ map CChar cs -- BNFC builtin character classes. cDigit, cLower, cUpper, cLetter, cAny :: CharClass cDigit = cAtom CDigit cLower = cAtom CLower cUpper = cAtom CUpper cLetter = CC $ CAlt [ CLower, CUpper ] cAny = CC CAny -- Embeddings. cAtom :: CharClassAtom -> CharClass cAtom = CC . CAlt . singleton -- | Smart constructor for @CharClass@ from difference.. -- -- Mutually reduce: @(A - B) = (A \ B) - (B \ A)@ ccuMinus :: CharClassUnion -> CharClassUnion -> Either CharClass CharClassUnion ccuMinus = curry $ \case (_ , CAny) -> Right mempty (c1@CAny, c2 ) | c2 == mempty -> Right c1 | otherwise -> Left $ c1 `CMinus` c2 (CAlt cs1, CAlt cs2) | null cs1' || null cs2' -> Right $ CAlt cs1' | otherwise -> Left $ CAlt cs1' `CMinus` CAlt cs2' where cs1' = cs1 List.\\ cs2 cs2' = cs2 List.\\ cs1 -------------------------------------------------------------------- onlyOneChar :: CharClassUnion -> Bool onlyOneChar CAny = True onlyOneChar (CAlt atoms) = length atoms == 1 isEmpty :: CharClassUnion -> Bool isEmpty CAny = False isEmpty (CAlt atoms) = null atoms