{-# LANGUAGE CPP #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE PatternGuards #-}

-- | Tools to manipulate regular expressions.

module BNFC.Regex ( nullable, simpReg ) where

#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup (Semigroup(..))
#endif
import Data.Set (Set)
import qualified Data.Set as Set
import qualified Data.List as List

import BNFC.Abs

-- | Check if a regular expression is nullable (accepts the empty string)
nullable :: Reg -> Bool
nullable :: Reg -> Bool
nullable = \case
  RSeq Reg
r1 Reg
r2   -> Reg -> Bool
nullable Reg
r1 Bool -> Bool -> Bool
&& Reg -> Bool
nullable Reg
r2
  RAlt Reg
r1 Reg
r2   -> Reg -> Bool
nullable Reg
r1 Bool -> Bool -> Bool
|| Reg -> Bool
nullable Reg
r2
  RMinus Reg
r1 Reg
r2 -> Reg -> Bool
nullable Reg
r1 Bool -> Bool -> Bool
&& Bool -> Bool
not (Reg -> Bool
nullable Reg
r2)
  RStar Reg
_      -> Bool
True
  RPlus Reg
r1     -> Reg -> Bool
nullable Reg
r1
  ROpt Reg
_       -> Bool
True
  Reg
REps         -> Bool
True
  RChar Char
_      -> Bool
False
  RAlts String
_      -> Bool
False
  RSeqs String
s      -> String -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null String
s
  Reg
RDigit       -> Bool
False
  Reg
RLetter      -> Bool
False
  Reg
RUpper       -> Bool
False
  Reg
RLower       -> Bool
False
  Reg
RAny         -> Bool
False

-- | Simplification of regular expression, mostly for the purpose
--   of simplifying character alternatives (character classes).
--
--   This may help lexer backends, since often lexer generators
--   have a limited syntax for character classes.
--
simpReg :: Reg -> Reg
simpReg :: Reg -> Reg
simpReg = Reg -> Reg
rloop
  where
  rloop :: Reg -> Reg
rloop = RC -> Reg
forall a. ToReg a => a -> Reg
rx (RC -> Reg) -> (Reg -> RC) -> Reg -> Reg
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Reg -> RC
loop
  loop :: Reg -> RC
  loop :: Reg -> RC
loop = \case
    -- Definitely not character classes:
    -- RSeq   r1 r2 -> Rx $ rloop r1 `rSeq` rloop r2
    RStar  Reg
r     -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ Reg -> Reg
rStar (Reg -> Reg) -> Reg -> Reg
forall a b. (a -> b) -> a -> b
$ Reg -> Reg
rloop Reg
r
    RPlus  Reg
r     -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ Reg -> Reg
rPlus (Reg -> Reg) -> Reg -> Reg
forall a b. (a -> b) -> a -> b
$ Reg -> Reg
rloop Reg
r
    ROpt   Reg
r     -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ Reg -> Reg
rOpt  (Reg -> Reg) -> Reg -> Reg
forall a b. (a -> b) -> a -> b
$ Reg -> Reg
rloop Reg
r
    Reg
REps         -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ Reg
REps
    RSeqs []     -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ Reg
REps
    RSeqs s :: String
s@(Char
_:Char
_:String
_) -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ String -> Reg
RSeqs String
s
    -- Possibly character classes:
    RSeq   Reg
r1 Reg
r2 -> Reg -> RC
loop Reg
r1 RC -> RC -> RC
`rcSeq`   Reg -> RC
loop Reg
r2
    RAlt   Reg
r1 Reg
r2 -> Reg -> RC
loop Reg
r1 RC -> RC -> RC
`rcAlt`   Reg -> RC
loop Reg
r2
    RMinus Reg
r1 Reg
r2 -> Reg -> RC
loop Reg
r1 RC -> RC -> RC
`rcMinus` Reg -> RC
loop Reg
r2
    -- Definitely character classes:
    RSeqs [Char
c]    -> CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ Char -> CharClass
cChar Char
c
    RChar Char
c      -> CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ Char -> CharClass
cChar Char
c
    RAlts String
s      -> CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ String -> CharClass
cAlts String
s
    Reg
RDigit       -> CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ CharClass
cDigit
    Reg
RLetter      -> CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ CharClass
cLetter
    Reg
RUpper       -> CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ CharClass
cUpper
    Reg
RLower       -> CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ CharClass
cLower
    Reg
RAny         -> CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ CharClass
cAny


-- | Character classes are regular expressions that recognize
-- character sequences of length exactly one.  These are often
-- distinguished from arbitrary regular expressions in lexer
-- generators, e.g. in @alex@.
--
-- We represent character classes as a difference of unions of atomic
-- character classes.

data CharClass = CMinus { CharClass -> CharClassUnion
_ccYes, CharClass -> CharClassUnion
_ccNo :: CharClassUnion }
  deriving (CharClass -> CharClass -> Bool
(CharClass -> CharClass -> Bool)
-> (CharClass -> CharClass -> Bool) -> Eq CharClass
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CharClass -> CharClass -> Bool
$c/= :: CharClass -> CharClass -> Bool
== :: CharClass -> CharClass -> Bool
$c== :: CharClass -> CharClass -> Bool
Eq, Eq CharClass
Eq CharClass
-> (CharClass -> CharClass -> Ordering)
-> (CharClass -> CharClass -> Bool)
-> (CharClass -> CharClass -> Bool)
-> (CharClass -> CharClass -> Bool)
-> (CharClass -> CharClass -> Bool)
-> (CharClass -> CharClass -> CharClass)
-> (CharClass -> CharClass -> CharClass)
-> Ord CharClass
CharClass -> CharClass -> Bool
CharClass -> CharClass -> Ordering
CharClass -> CharClass -> CharClass
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: CharClass -> CharClass -> CharClass
$cmin :: CharClass -> CharClass -> CharClass
max :: CharClass -> CharClass -> CharClass
$cmax :: CharClass -> CharClass -> CharClass
>= :: CharClass -> CharClass -> Bool
$c>= :: CharClass -> CharClass -> Bool
> :: CharClass -> CharClass -> Bool
$c> :: CharClass -> CharClass -> Bool
<= :: CharClass -> CharClass -> Bool
$c<= :: CharClass -> CharClass -> Bool
< :: CharClass -> CharClass -> Bool
$c< :: CharClass -> CharClass -> Bool
compare :: CharClass -> CharClass -> Ordering
$ccompare :: CharClass -> CharClass -> Ordering
$cp1Ord :: Eq CharClass
Ord, Int -> CharClass -> ShowS
[CharClass] -> ShowS
CharClass -> String
(Int -> CharClass -> ShowS)
-> (CharClass -> String)
-> ([CharClass] -> ShowS)
-> Show CharClass
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CharClass] -> ShowS
$cshowList :: [CharClass] -> ShowS
show :: CharClass -> String
$cshow :: CharClass -> String
showsPrec :: Int -> CharClass -> ShowS
$cshowsPrec :: Int -> CharClass -> ShowS
Show)

data CharClassUnion
  = CAny                       -- ^ Any character.
  | CAlt (Set CharClassAtom)  -- ^ Any of the given (≥0) alternatives.
  deriving (CharClassUnion -> CharClassUnion -> Bool
(CharClassUnion -> CharClassUnion -> Bool)
-> (CharClassUnion -> CharClassUnion -> Bool) -> Eq CharClassUnion
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CharClassUnion -> CharClassUnion -> Bool
$c/= :: CharClassUnion -> CharClassUnion -> Bool
== :: CharClassUnion -> CharClassUnion -> Bool
$c== :: CharClassUnion -> CharClassUnion -> Bool
Eq, Eq CharClassUnion
Eq CharClassUnion
-> (CharClassUnion -> CharClassUnion -> Ordering)
-> (CharClassUnion -> CharClassUnion -> Bool)
-> (CharClassUnion -> CharClassUnion -> Bool)
-> (CharClassUnion -> CharClassUnion -> Bool)
-> (CharClassUnion -> CharClassUnion -> Bool)
-> (CharClassUnion -> CharClassUnion -> CharClassUnion)
-> (CharClassUnion -> CharClassUnion -> CharClassUnion)
-> Ord CharClassUnion
CharClassUnion -> CharClassUnion -> Bool
CharClassUnion -> CharClassUnion -> Ordering
CharClassUnion -> CharClassUnion -> CharClassUnion
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: CharClassUnion -> CharClassUnion -> CharClassUnion
$cmin :: CharClassUnion -> CharClassUnion -> CharClassUnion
max :: CharClassUnion -> CharClassUnion -> CharClassUnion
$cmax :: CharClassUnion -> CharClassUnion -> CharClassUnion
>= :: CharClassUnion -> CharClassUnion -> Bool
$c>= :: CharClassUnion -> CharClassUnion -> Bool
> :: CharClassUnion -> CharClassUnion -> Bool
$c> :: CharClassUnion -> CharClassUnion -> Bool
<= :: CharClassUnion -> CharClassUnion -> Bool
$c<= :: CharClassUnion -> CharClassUnion -> Bool
< :: CharClassUnion -> CharClassUnion -> Bool
$c< :: CharClassUnion -> CharClassUnion -> Bool
compare :: CharClassUnion -> CharClassUnion -> Ordering
$ccompare :: CharClassUnion -> CharClassUnion -> Ordering
$cp1Ord :: Eq CharClassUnion
Ord, Int -> CharClassUnion -> ShowS
[CharClassUnion] -> ShowS
CharClassUnion -> String
(Int -> CharClassUnion -> ShowS)
-> (CharClassUnion -> String)
-> ([CharClassUnion] -> ShowS)
-> Show CharClassUnion
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CharClassUnion] -> ShowS
$cshowList :: [CharClassUnion] -> ShowS
show :: CharClassUnion -> String
$cshow :: CharClassUnion -> String
showsPrec :: Int -> CharClassUnion -> ShowS
$cshowsPrec :: Int -> CharClassUnion -> ShowS
Show)

data CharClassAtom
  = CChar Char  -- ^ A single character.
  | CDigit      -- ^ @0-9@.
  | CLower      -- ^ Lower case character.
  | CUpper      -- ^ Upper case character.
  deriving (CharClassAtom -> CharClassAtom -> Bool
(CharClassAtom -> CharClassAtom -> Bool)
-> (CharClassAtom -> CharClassAtom -> Bool) -> Eq CharClassAtom
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CharClassAtom -> CharClassAtom -> Bool
$c/= :: CharClassAtom -> CharClassAtom -> Bool
== :: CharClassAtom -> CharClassAtom -> Bool
$c== :: CharClassAtom -> CharClassAtom -> Bool
Eq, Eq CharClassAtom
Eq CharClassAtom
-> (CharClassAtom -> CharClassAtom -> Ordering)
-> (CharClassAtom -> CharClassAtom -> Bool)
-> (CharClassAtom -> CharClassAtom -> Bool)
-> (CharClassAtom -> CharClassAtom -> Bool)
-> (CharClassAtom -> CharClassAtom -> Bool)
-> (CharClassAtom -> CharClassAtom -> CharClassAtom)
-> (CharClassAtom -> CharClassAtom -> CharClassAtom)
-> Ord CharClassAtom
CharClassAtom -> CharClassAtom -> Bool
CharClassAtom -> CharClassAtom -> Ordering
CharClassAtom -> CharClassAtom -> CharClassAtom
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: CharClassAtom -> CharClassAtom -> CharClassAtom
$cmin :: CharClassAtom -> CharClassAtom -> CharClassAtom
max :: CharClassAtom -> CharClassAtom -> CharClassAtom
$cmax :: CharClassAtom -> CharClassAtom -> CharClassAtom
>= :: CharClassAtom -> CharClassAtom -> Bool
$c>= :: CharClassAtom -> CharClassAtom -> Bool
> :: CharClassAtom -> CharClassAtom -> Bool
$c> :: CharClassAtom -> CharClassAtom -> Bool
<= :: CharClassAtom -> CharClassAtom -> Bool
$c<= :: CharClassAtom -> CharClassAtom -> Bool
< :: CharClassAtom -> CharClassAtom -> Bool
$c< :: CharClassAtom -> CharClassAtom -> Bool
compare :: CharClassAtom -> CharClassAtom -> Ordering
$ccompare :: CharClassAtom -> CharClassAtom -> Ordering
$cp1Ord :: Eq CharClassAtom
Ord, Int -> CharClassAtom -> ShowS
[CharClassAtom] -> ShowS
CharClassAtom -> String
(Int -> CharClassAtom -> ShowS)
-> (CharClassAtom -> String)
-> ([CharClassAtom] -> ShowS)
-> Show CharClassAtom
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CharClassAtom] -> ShowS
$cshowList :: [CharClassAtom] -> ShowS
show :: CharClassAtom -> String
$cshow :: CharClassAtom -> String
showsPrec :: Int -> CharClassAtom -> ShowS
$cshowsPrec :: Int -> CharClassAtom -> ShowS
Show)

-- -- | Regular expressions are constructed over character classes.
-- --
-- --   We do not simplify at the level of regular expressions;
-- --   this is left to the backend.
-- data Regex
--     = RxAlt Reg Reg
--     | RxMinus Reg Reg
--     | RxSeq Reg Reg
--     | RxStar Reg
--     | RxPlus Reg
--     | RxOpt Reg
--     | RxEps
--     | RxChar CharClass
--   deriving (Eq, Ord, Show)

-- | A regular expression that might be a character class.
data RC
  = Rx Reg
  | CC CharClass

-- * Smart constructors for regular expressions.

rSeq :: Reg -> Reg -> Reg
rSeq :: Reg -> Reg -> Reg
rSeq = ((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg
forall a b c. ((a, b) -> c) -> a -> b -> c
curry (((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg)
-> ((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg
forall a b. (a -> b) -> a -> b
$ \case
  -- 0r = 0
  (RAlts String
"", Reg
_       ) -> String -> Reg
RAlts String
""
  (Reg
_       , RAlts String
"") -> String -> Reg
RAlts String
""
  -- 1r = r
  (Reg
REps    , Reg
r       ) -> Reg
r
  (RSeqs String
"", Reg
r       ) -> Reg
r
  (Reg
r       , Reg
REps    ) -> Reg
r
  (Reg
r       , RSeqs String
"") -> Reg
r
  -- r*r* = r*
  (RStar Reg
r1, RStar Reg
r2) | Reg
r1 Reg -> Reg -> Bool
forall a. Eq a => a -> a -> Bool
== Reg
r2 -> Reg -> Reg
rStar Reg
r1
  -- r+r* = r*r+ = r+
  (RPlus Reg
r1, RStar Reg
r2) | Reg
r1 Reg -> Reg -> Bool
forall a. Eq a => a -> a -> Bool
== Reg
r2 -> Reg -> Reg
rPlus Reg
r1
  (RStar Reg
r1, RPlus Reg
r2) | Reg
r1 Reg -> Reg -> Bool
forall a. Eq a => a -> a -> Bool
== Reg
r2 -> Reg -> Reg
rPlus Reg
r1
  -- rr* = r*r = r+
  (Reg
r1      , RStar Reg
r2) | Reg
r1 Reg -> Reg -> Bool
forall a. Eq a => a -> a -> Bool
== Reg
r2 -> Reg -> Reg
rPlus Reg
r1
  (RStar Reg
r1, Reg
r2      ) | Reg
r1 Reg -> Reg -> Bool
forall a. Eq a => a -> a -> Bool
== Reg
r2 -> Reg -> Reg
rPlus Reg
r1
  -- character sequences
  (RSeqs String
s1, RSeqs String
s2) -> String -> Reg
RSeqs (String -> Reg) -> String -> Reg
forall a b. (a -> b) -> a -> b
$ String
s1 String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
s2
  (RChar Char
c1, RSeqs String
s2) -> String -> Reg
RSeqs (String -> Reg) -> String -> Reg
forall a b. (a -> b) -> a -> b
$ Char
c1 Char -> ShowS
forall a. a -> [a] -> [a]
: String
s2
  (RSeqs String
s1, RChar Char
c2) -> String -> Reg
RSeqs (String -> Reg) -> String -> Reg
forall a b. (a -> b) -> a -> b
$ String
s1 String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char
c2]
  (RChar Char
c1, RChar Char
c2) -> String -> Reg
RSeqs [ Char
c1, Char
c2 ]
  -- Associate to the left
  (Reg
r1    , RSeq Reg
r2 Reg
r3) -> (Reg
r1 Reg -> Reg -> Reg
`rSeq` Reg
r2) Reg -> Reg -> Reg
`rSeq` Reg
r3
  -- general sequences
  (Reg
r1      , Reg
r2      ) -> Reg
r1 Reg -> Reg -> Reg
`RSeq` Reg
r2

rAlt :: Reg -> Reg -> Reg
rAlt :: Reg -> Reg -> Reg
rAlt = ((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg
forall a b c. ((a, b) -> c) -> a -> b -> c
curry (((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg)
-> ((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg
forall a b. (a -> b) -> a -> b
$ \case
  -- 0 + r = r
  (RAlts String
"", Reg
r       ) -> Reg
r
  -- r + 0 = r
  (Reg
r       , RAlts String
"") -> Reg
r
  -- join character alternatives
  (RAlts String
s1, RAlts String
s2) -> String -> Reg
RAlts (String -> Reg) -> String -> Reg
forall a b. (a -> b) -> a -> b
$ String
s1 String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
s2
  (RChar Char
c1, RAlts String
s2) -> String -> Reg
RAlts (String -> Reg) -> String -> Reg
forall a b. (a -> b) -> a -> b
$ Char
c1 Char -> ShowS
forall a. a -> [a] -> [a]
: String
s2
  (RAlts String
s1, RChar Char
c2) -> String -> Reg
RAlts (String -> Reg) -> String -> Reg
forall a b. (a -> b) -> a -> b
$ String
s1 String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char
c2]
  (RChar Char
c1, RChar Char
c2) -> String -> Reg
RAlts [ Char
c1, Char
c2 ]
  -- Associate to the left
  (Reg
r1    , RAlt Reg
r2 Reg
r3) -> (Reg
r1 Reg -> Reg -> Reg
`rAlt` Reg
r2) Reg -> Reg -> Reg
`rAlt` Reg
r3
  -- general alternatives
  (Reg
r1, Reg
r2)
     | Reg
r1 Reg -> Reg -> Bool
forall a. Eq a => a -> a -> Bool
== Reg
r2  -> Reg
r1  -- idempotency, but not the general case
     | Bool
otherwise -> Reg
r1 Reg -> Reg -> Reg
`RAlt` Reg
r2

rMinus :: Reg -> Reg -> Reg
rMinus :: Reg -> Reg -> Reg
rMinus = ((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg
forall a b c. ((a, b) -> c) -> a -> b -> c
curry (((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg)
-> ((Reg, Reg) -> Reg) -> Reg -> Reg -> Reg
forall a b. (a -> b) -> a -> b
$ \case
  -- 0 - r = 0
  (RAlts String
"", Reg
_       ) -> String -> Reg
RAlts String
""
  -- r - 0 = r
  (Reg
r       , RAlts String
"") -> Reg
r
  -- join character alternatives
  (RAlts String
s1, RAlts String
s2) -> case String
s1 String -> ShowS
forall a. Eq a => [a] -> [a] -> [a]
List.\\ String
s2 of
    [Char
c] -> Char -> Reg
RChar Char
c
    String
s   -> String -> Reg
RAlts String
s
  (Reg
r1, Reg
r2)
     | Reg
r1 Reg -> Reg -> Bool
forall a. Eq a => a -> a -> Bool
== Reg
r2  -> String -> Reg
RAlts String
""
     | Bool
otherwise -> Reg
r1 Reg -> Reg -> Reg
`RMinus` Reg
r2

rStar :: Reg -> Reg
rStar :: Reg -> Reg
rStar = \case
  Reg
REps     -> Reg
REps
  RSeqs String
"" -> Reg
REps
  RAlts String
"" -> Reg
REps
  ROpt  Reg
r  -> Reg -> Reg
RStar Reg
r
  RStar Reg
r  -> Reg -> Reg
RStar Reg
r
  RPlus Reg
r  -> Reg -> Reg
RStar Reg
r
  Reg
r        -> Reg -> Reg
RStar Reg
r

rPlus :: Reg -> Reg
rPlus :: Reg -> Reg
rPlus = \case
  Reg
REps     -> Reg
REps
  RSeqs String
"" -> Reg
REps
  RAlts String
"" -> String -> Reg
RAlts String
""
  ROpt  Reg
r  -> Reg -> Reg
RStar Reg
r
  RStar Reg
r  -> Reg -> Reg
RStar Reg
r
  RPlus Reg
r  -> Reg -> Reg
RPlus Reg
r
  Reg
r        -> Reg -> Reg
RPlus Reg
r

rOpt :: Reg -> Reg
rOpt :: Reg -> Reg
rOpt = \case
  Reg
REps     -> Reg
REps
  RSeqs String
"" -> Reg
REps
  RAlts String
"" -> Reg
REps
  RStar Reg
r  -> Reg -> Reg
RStar Reg
r
  RPlus Reg
r  -> Reg -> Reg
RStar Reg
r
  ROpt  Reg
r  -> Reg -> Reg
ROpt  Reg
r
  Reg
r        -> Reg -> Reg
ROpt  Reg
r

rcSeq :: RC -> RC -> RC
rcSeq :: RC -> RC -> RC
rcSeq = ((RC, RC) -> RC) -> RC -> RC -> RC
forall a b c. ((a, b) -> c) -> a -> b -> c
curry (((RC, RC) -> RC) -> RC -> RC -> RC)
-> ((RC, RC) -> RC) -> RC -> RC -> RC
forall a b. (a -> b) -> a -> b
$ \case
  (Rx Reg
REps      , RC
r            ) -> RC
r
  (Rx (RSeqs String
""), RC
r            ) -> RC
r
  (RC
r            , Rx Reg
REps      ) -> RC
r
  (RC
r            , Rx (RSeqs String
"")) -> RC
r
  (RC
r1           , RC
r2           ) -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ RC -> Reg
forall a. ToReg a => a -> Reg
rx RC
r1 Reg -> Reg -> Reg
`rSeq` RC -> Reg
forall a. ToReg a => a -> Reg
rx RC
r2

rcAlt :: RC -> RC -> RC
rcAlt :: RC -> RC -> RC
rcAlt = ((RC, RC) -> RC) -> RC -> RC -> RC
forall a b c. ((a, b) -> c) -> a -> b -> c
curry (((RC, RC) -> RC) -> RC -> RC -> RC)
-> ((RC, RC) -> RC) -> RC -> RC -> RC
forall a b. (a -> b) -> a -> b
$ \case
  -- 0 + r = r + 0 = r
  (Rx (RAlts String
""), RC
r) -> RC
r
  (RC
r, Rx (RAlts String
"")) -> RC
r
  -- other cases
  (CC CharClass
c1, CC CharClass
c2) -> CharClass
c1 CharClass -> CharClass -> RC
`cAlt` CharClass
c2
  (RC
c1   , RC
c2   ) -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ RC -> Reg
forall a. ToReg a => a -> Reg
rx RC
c1 Reg -> Reg -> Reg
`rAlt` RC -> Reg
forall a. ToReg a => a -> Reg
rx RC
c2

rcMinus :: RC -> RC -> RC
rcMinus :: RC -> RC -> RC
rcMinus = ((RC, RC) -> RC) -> RC -> RC -> RC
forall a b c. ((a, b) -> c) -> a -> b -> c
curry (((RC, RC) -> RC) -> RC -> RC -> RC)
-> ((RC, RC) -> RC) -> RC -> RC -> RC
forall a b. (a -> b) -> a -> b
$ \case
  -- r - 0 = r
  (RC
r    , Rx (RAlts String
"")) -> RC
r
  (CC CharClass
c1, CC CharClass
c2        ) -> CharClass
c1 CharClass -> CharClass -> RC
`cMinus` CharClass
c2
  (RC
c1   , RC
c2           ) -> Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ RC -> Reg
forall a. ToReg a => a -> Reg
rx RC
c1 Reg -> Reg -> Reg
`rMinus` RC -> Reg
forall a. ToReg a => a -> Reg
rx RC
c2

class ToReg a where
  rx :: a -> Reg

instance ToReg RC where
  rx :: RC -> Reg
rx (Rx Reg
r) = Reg
r
  rx (CC CharClass
c) = CharClass -> Reg
forall a. ToReg a => a -> Reg
rx CharClass
c

instance ToReg CharClass where
  rx :: CharClass -> Reg
rx (CMinus CharClassUnion
p CharClassUnion
m)
    | CharClassUnion
m CharClassUnion -> CharClassUnion -> Bool
forall a. Eq a => a -> a -> Bool
== CharClassUnion
forall a. Monoid a => a
mempty = CharClassUnion -> Reg
forall a. ToReg a => a -> Reg
rx CharClassUnion
p
    | CharClassUnion
p CharClassUnion -> CharClassUnion -> Bool
forall a. Eq a => a -> a -> Bool
== CharClassUnion
forall a. Monoid a => a
mempty = String -> Reg
RAlts String
""
    | Bool
otherwise = CharClassUnion -> Reg
forall a. ToReg a => a -> Reg
rx CharClassUnion
p Reg -> Reg -> Reg
`RMinus` CharClassUnion -> Reg
forall a. ToReg a => a -> Reg
rx CharClassUnion
m

instance ToReg CharClassUnion where
  rx :: CharClassUnion -> Reg
rx CharClassUnion
CAny      = Reg
RAny
  rx (CAlt Set CharClassAtom
cs) = case [Reg]
rs of
      []  -> String -> Reg
RAlts String
""
      [Reg
r] -> Reg
r
      [Reg]
rs  -> (Reg -> Reg -> Reg) -> [Reg] -> Reg
forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldr1 Reg -> Reg -> Reg
RAlt [Reg]
rs
    where
    -- collect elements of cs into St
    start :: St
start = Bool -> Bool -> Bool -> String -> St
St Bool
False Bool
False Bool
False String
""
    step :: St -> CharClassAtom -> St
step St
st = \case
      CChar Char
c -> St
st { stAlts :: String
stAlts = Char
c Char -> ShowS
forall a. a -> [a] -> [a]
: St -> String
stAlts St
st }
      CharClassAtom
CDigit  -> St
st { stDigit :: Bool
stDigit = Bool
True }
      CharClassAtom
CLower  -> St
st { stLower :: Bool
stLower = Bool
True }
      CharClassAtom
CUpper  -> St
st { stUpper :: Bool
stUpper = Bool
True }
    (St Bool
digit Bool
upper Bool
lower String
alts) = (St -> CharClassAtom -> St) -> St -> [CharClassAtom] -> St
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl St -> CharClassAtom -> St
step St
start ([CharClassAtom] -> St) -> [CharClassAtom] -> St
forall a b. (a -> b) -> a -> b
$ Set CharClassAtom -> [CharClassAtom]
forall a. Set a -> [a]
Set.toDescList Set CharClassAtom
cs
    rs :: [Reg]
rs = [[Reg]] -> [Reg]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat
      [ [ Char -> Reg
RChar Char
c    | [Char
c] <- [String
alts]      ]
      , [ String -> Reg
RAlts String
alts | (Char
_:Char
_:String
_) <- [String
alts]  ]
      , [ Reg
RDigit     | Bool
digit              ]
      , [ Reg
RLetter    | Bool
upper Bool -> Bool -> Bool
&& Bool
lower     ]
      , [ Reg
RUpper     | Bool
upper Bool -> Bool -> Bool
&& Bool -> Bool
not Bool
lower ]
      , [ Reg
RLower     | Bool
lower Bool -> Bool -> Bool
&& Bool -> Bool
not Bool
upper ]
      ]

  -- Local state type
data St = St { St -> Bool
stDigit, St -> Bool
stUpper, St -> Bool
stLower :: Bool, St -> String
stAlts :: String }

-- UNUSED:
-- instance ToReg CharClassAtom where
--   rx = \case
--     CChar c -> RChar c
--     CDigit  -> RDigit
--     CLower  -> RLower
--     CUpper  -> RUpper

-- * Constructors for character classes.

-- |
-- @(p1 \ m1) ∪ (p2 \ m2) = (p1 ∪ p2) \ (m1 ∪ m2)@ if @p1 ⊥ m2@ and @p2 ⊥ m1@
cAlt :: CharClass -> CharClass -> RC
cAlt :: CharClass -> CharClass -> RC
cAlt c1 :: CharClass
c1@(CMinus CharClassUnion
p1 CharClassUnion
m1) c2 :: CharClass
c2@(CMinus CharClassUnion
p2 CharClassUnion
m2)
  | CharClass
c1 CharClass -> CharClass -> Bool
forall a. Eq a => a -> a -> Bool
== CharClass
cAny Bool -> Bool -> Bool
|| CharClass
c2 CharClass -> CharClass -> Bool
forall a. Eq a => a -> a -> Bool
== CharClass
cAny     = CharClass -> RC
CC CharClass
cAny
  | CharClassUnion
p1 CharClassUnion -> CharClassUnion -> Either CharClass CharClassUnion
`ccuMinus` CharClassUnion
m2 Either CharClass CharClassUnion
-> Either CharClass CharClassUnion -> Bool
forall a. Eq a => a -> a -> Bool
== CharClassUnion -> Either CharClass CharClassUnion
forall a b. b -> Either a b
Right CharClassUnion
p1,
    CharClassUnion
p2 CharClassUnion -> CharClassUnion -> Either CharClass CharClassUnion
`ccuMinus` CharClassUnion
m1 Either CharClass CharClassUnion
-> Either CharClass CharClassUnion -> Bool
forall a. Eq a => a -> a -> Bool
== CharClassUnion -> Either CharClass CharClassUnion
forall a b. b -> Either a b
Right CharClassUnion
p2 = CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ (CharClass -> CharClass)
-> (CharClassUnion -> CharClass)
-> Either CharClass CharClassUnion
-> CharClass
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either CharClass -> CharClass
forall a. a -> a
id CharClassUnion -> CharClass
ccu (Either CharClass CharClassUnion -> CharClass)
-> Either CharClass CharClassUnion -> CharClass
forall a b. (a -> b) -> a -> b
$ (CharClassUnion
p1 CharClassUnion -> CharClassUnion -> CharClassUnion
forall a. Semigroup a => a -> a -> a
<> CharClassUnion
p2) CharClassUnion -> CharClassUnion -> Either CharClass CharClassUnion
`ccuMinus` (CharClassUnion
m1 CharClassUnion -> CharClassUnion -> CharClassUnion
forall a. Semigroup a => a -> a -> a
<> CharClassUnion
m2)
  | Bool
otherwise                    = Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ CharClass -> Reg
forall a. ToReg a => a -> Reg
rx CharClass
c1 Reg -> Reg -> Reg
`RAlt` CharClass -> Reg
forall a. ToReg a => a -> Reg
rx CharClass
c2

  -- -- | ccuDisjoint p1 m2, ccuDisjoint p2 m1 = CC $ either id ccu $ (p1 <> p2) `ccuMinus` (m1 <> m2)
  -- -- | null m1, null m2 = CC $ ccu (p1 <> p2)

-- |
-- @(p1 \ m1) \ (0 \ m2)  = p1 \ m1@
-- @(p1 \ m1) \ (p2 \ m2) = p1 \ (m1 ∪ p2)@  if @p1 \ m2 = p1@
cMinus :: CharClass -> CharClass -> RC
cMinus :: CharClass -> CharClass -> RC
cMinus c1 :: CharClass
c1@(CMinus CharClassUnion
p1 CharClassUnion
m1) c2 :: CharClass
c2@(CMinus CharClassUnion
p2 CharClassUnion
m2)
  | CharClassUnion
p2 CharClassUnion -> CharClassUnion -> Bool
forall a. Eq a => a -> a -> Bool
== CharClassUnion
forall a. Monoid a => a
mempty                 = CharClass -> RC
CC CharClass
c1
  | CharClassUnion
p1 CharClassUnion -> CharClassUnion -> Either CharClass CharClassUnion
`ccuMinus` CharClassUnion
m2 Either CharClass CharClassUnion
-> Either CharClass CharClassUnion -> Bool
forall a. Eq a => a -> a -> Bool
== CharClassUnion -> Either CharClass CharClassUnion
forall a b. b -> Either a b
Right CharClassUnion
p1 = CharClass -> RC
CC (CharClass -> RC) -> CharClass -> RC
forall a b. (a -> b) -> a -> b
$ (CharClass -> CharClass)
-> (CharClassUnion -> CharClass)
-> Either CharClass CharClassUnion
-> CharClass
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either CharClass -> CharClass
forall a. a -> a
id CharClassUnion -> CharClass
ccu (Either CharClass CharClassUnion -> CharClass)
-> Either CharClass CharClassUnion -> CharClass
forall a b. (a -> b) -> a -> b
$ CharClassUnion
p1 CharClassUnion -> CharClassUnion -> Either CharClass CharClassUnion
`ccuMinus` (CharClassUnion
m1 CharClassUnion -> CharClassUnion -> CharClassUnion
forall a. Semigroup a => a -> a -> a
<> CharClassUnion
p2)
  | Bool
otherwise                    = Reg -> RC
Rx (Reg -> RC) -> Reg -> RC
forall a b. (a -> b) -> a -> b
$ CharClass -> Reg
forall a. ToReg a => a -> Reg
rx CharClass
c1 Reg -> Reg -> Reg
`RMinus` CharClass -> Reg
forall a. ToReg a => a -> Reg
rx CharClass
c2

cChar :: Char -> CharClass
cChar :: Char -> CharClass
cChar Char
c = String -> CharClass
cAlts [Char
c]

cAlts :: String -> CharClass
cAlts :: String -> CharClass
cAlts String
cs = CharClassUnion -> CharClass
ccu (CharClassUnion -> CharClass) -> CharClassUnion -> CharClass
forall a b. (a -> b) -> a -> b
$ Set CharClassAtom -> CharClassUnion
CAlt (Set CharClassAtom -> CharClassUnion)
-> Set CharClassAtom -> CharClassUnion
forall a b. (a -> b) -> a -> b
$ [CharClassAtom] -> Set CharClassAtom
forall a. Ord a => [a] -> Set a
Set.fromList ([CharClassAtom] -> Set CharClassAtom)
-> [CharClassAtom] -> Set CharClassAtom
forall a b. (a -> b) -> a -> b
$ (Char -> CharClassAtom) -> String -> [CharClassAtom]
forall a b. (a -> b) -> [a] -> [b]
map Char -> CharClassAtom
CChar String
cs

cDigit, cLower, cUpper, cLetter, cAny :: CharClass
cDigit :: CharClass
cDigit  = CharClassAtom -> CharClass
cAtom CharClassAtom
CDigit
cLower :: CharClass
cLower  = CharClassAtom -> CharClass
cAtom CharClassAtom
CLower
cUpper :: CharClass
cUpper  = CharClassAtom -> CharClass
cAtom CharClassAtom
CUpper
cLetter :: CharClass
cLetter = CharClassUnion -> CharClass
ccu (CharClassUnion -> CharClass) -> CharClassUnion -> CharClass
forall a b. (a -> b) -> a -> b
$ Set CharClassAtom -> CharClassUnion
CAlt (Set CharClassAtom -> CharClassUnion)
-> Set CharClassAtom -> CharClassUnion
forall a b. (a -> b) -> a -> b
$ [CharClassAtom] -> Set CharClassAtom
forall a. Ord a => [a] -> Set a
Set.fromList [ CharClassAtom
CLower, CharClassAtom
CUpper ]
cAny :: CharClass
cAny    = CharClassUnion -> CharClass
ccu CharClassUnion
CAny

cAtom :: CharClassAtom -> CharClass
cAtom :: CharClassAtom -> CharClass
cAtom = CharClassUnion -> CharClass
ccu (CharClassUnion -> CharClass)
-> (CharClassAtom -> CharClassUnion) -> CharClassAtom -> CharClass
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Set CharClassAtom -> CharClassUnion
CAlt (Set CharClassAtom -> CharClassUnion)
-> (CharClassAtom -> Set CharClassAtom)
-> CharClassAtom
-> CharClassUnion
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CharClassAtom -> Set CharClassAtom
forall a. a -> Set a
Set.singleton

ccu :: CharClassUnion -> CharClass
ccu :: CharClassUnion -> CharClass
ccu = (CharClassUnion -> CharClassUnion -> CharClass
`CMinus` CharClassUnion
forall a. Monoid a => a
mempty)

-- (A \ B) \ (C \ D) = A \ (B ∪ (C \ D))

-- | Mutually reduce:  @(A - B) = (A \ B) - (B \ A)@
ccuMinus :: CharClassUnion -> CharClassUnion -> Either CharClass CharClassUnion
ccuMinus :: CharClassUnion -> CharClassUnion -> Either CharClass CharClassUnion
ccuMinus = ((CharClassUnion, CharClassUnion)
 -> Either CharClass CharClassUnion)
-> CharClassUnion
-> CharClassUnion
-> Either CharClass CharClassUnion
forall a b c. ((a, b) -> c) -> a -> b -> c
curry (((CharClassUnion, CharClassUnion)
  -> Either CharClass CharClassUnion)
 -> CharClassUnion
 -> CharClassUnion
 -> Either CharClass CharClassUnion)
-> ((CharClassUnion, CharClassUnion)
    -> Either CharClass CharClassUnion)
-> CharClassUnion
-> CharClassUnion
-> Either CharClass CharClassUnion
forall a b. (a -> b) -> a -> b
$ \case
  (CharClassUnion
_      , CharClassUnion
CAny)   -> CharClassUnion -> Either CharClass CharClassUnion
forall a b. b -> Either a b
Right CharClassUnion
forall a. Monoid a => a
mempty
  (c1 :: CharClassUnion
c1@CharClassUnion
CAny, CharClassUnion
c2  )
    | CharClassUnion
c2 CharClassUnion -> CharClassUnion -> Bool
forall a. Eq a => a -> a -> Bool
== CharClassUnion
forall a. Monoid a => a
mempty  -> CharClassUnion -> Either CharClass CharClassUnion
forall a b. b -> Either a b
Right (CharClassUnion -> Either CharClass CharClassUnion)
-> CharClassUnion -> Either CharClass CharClassUnion
forall a b. (a -> b) -> a -> b
$ CharClassUnion
c1
    | Bool
otherwise     -> CharClass -> Either CharClass CharClassUnion
forall a b. a -> Either a b
Left  (CharClass -> Either CharClass CharClassUnion)
-> CharClass -> Either CharClass CharClassUnion
forall a b. (a -> b) -> a -> b
$ CharClassUnion
c1 CharClassUnion -> CharClassUnion -> CharClass
`CMinus` CharClassUnion
c2
  (CAlt Set CharClassAtom
cs1, CAlt Set CharClassAtom
cs2)
    | Set CharClassAtom -> Bool
forall a. Set a -> Bool
Set.null Set CharClassAtom
cs1' Bool -> Bool -> Bool
||
      Set CharClassAtom -> Bool
forall a. Set a -> Bool
Set.null Set CharClassAtom
cs2' -> CharClassUnion -> Either CharClass CharClassUnion
forall a b. b -> Either a b
Right (CharClassUnion -> Either CharClass CharClassUnion)
-> CharClassUnion -> Either CharClass CharClassUnion
forall a b. (a -> b) -> a -> b
$ Set CharClassAtom -> CharClassUnion
CAlt Set CharClassAtom
cs1'
    | Bool
otherwise     -> CharClass -> Either CharClass CharClassUnion
forall a b. a -> Either a b
Left  (CharClass -> Either CharClass CharClassUnion)
-> CharClass -> Either CharClass CharClassUnion
forall a b. (a -> b) -> a -> b
$ Set CharClassAtom -> CharClassUnion
CAlt Set CharClassAtom
cs1' CharClassUnion -> CharClassUnion -> CharClass
`CMinus` Set CharClassAtom -> CharClassUnion
CAlt Set CharClassAtom
cs2'
    where
    cs1' :: Set CharClassAtom
cs1' = Set CharClassAtom
cs1 Set CharClassAtom -> Set CharClassAtom -> Set CharClassAtom
forall a. Ord a => Set a -> Set a -> Set a
Set.\\ Set CharClassAtom
cs2
    cs2' :: Set CharClassAtom
cs2' = Set CharClassAtom
cs2 Set CharClassAtom -> Set CharClassAtom -> Set CharClassAtom
forall a. Ord a => Set a -> Set a -> Set a
Set.\\ Set CharClassAtom
cs1

instance Semigroup CharClassUnion where
  CharClassUnion
CAny    <> :: CharClassUnion -> CharClassUnion -> CharClassUnion
<> CharClassUnion
_        = CharClassUnion
CAny
  CharClassUnion
_       <> CharClassUnion
CAny     = CharClassUnion
CAny
  CAlt Set CharClassAtom
cs <> CAlt Set CharClassAtom
cs' = Set CharClassAtom -> CharClassUnion
CAlt (Set CharClassAtom
cs Set CharClassAtom -> Set CharClassAtom -> Set CharClassAtom
forall a. Semigroup a => a -> a -> a
<> Set CharClassAtom
cs')

instance Monoid CharClassUnion where
  mempty :: CharClassUnion
mempty  = Set CharClassAtom -> CharClassUnion
CAlt Set CharClassAtom
forall a. Set a
Set.empty
  mappend :: CharClassUnion -> CharClassUnion -> CharClassUnion
mappend = CharClassUnion -> CharClassUnion -> CharClassUnion
forall a. Semigroup a => a -> a -> a
(<>)