module Text.Regex.XMLSchema.Generic.Regex
( GenRegex
, mkZero
, mkZero'
, mkUnit
, mkSym
, mkSym1
, mkSymRng
, mkWord
, mkDot
, mkStar
, mkAll
, mkAlt
, mkElse
, mkSeq
, mkSeqs
, mkRep
, mkRng
, mkOpt
, mkDiff
, mkIsect
, mkExor
, mkInterleave
, mkCompl
, mkBr
, mkBr'
, isZero
, errRegex
, nullable
, nullable'
, delta1
, delta
, firstChars
, matchWithRegex
, matchWithRegex'
, splitWithRegex
, splitWithRegex'
, splitWithRegexCS
, splitWithRegexCS'
)
where
import Data.List (intercalate)
import Data.Monoid ((<>))
import Data.Set.CharSet
import Data.String (IsString(..))
import Text.Regex.XMLSchema.Generic.StringLike
data GenRegex s
= Zero s
| Unit
| Sym CharSet
| Dot
| Star (GenRegex s)
| Alt (GenRegex s) (GenRegex s)
| Else (GenRegex s) (GenRegex s)
| Seq (GenRegex s) (GenRegex s)
| Rep Int (GenRegex s)
| Rng Int Int (GenRegex s)
| Diff (GenRegex s) (GenRegex s)
| Isec (GenRegex s) (GenRegex s)
| Exor (GenRegex s) (GenRegex s)
| Intl (GenRegex s) (GenRegex s)
| Br (Label s) (GenRegex s)
| Obr (Label s) s !Int (GenRegex s)
| Cbr [(Label s, s)] (GenRegex s)
deriving (Eq, Ord )
type Label s
= Maybe s
type SubexResults s
= [(Label s, s)]
type Nullable s
= (Bool, SubexResults s)
mkZero :: s -> GenRegex s
mkZero = Zero
mkZero' :: (StringLike s) =>
String -> GenRegex s
mkZero' = Zero . fromString
mkUnit :: GenRegex s
mkUnit = Unit
mkSym :: (StringLike s) =>
CharSet -> GenRegex s
mkSym s
| nullCS s = mkZero' "empty char range"
| fullCS s = mkDot
| otherwise = Sym s
mkSym1 :: (StringLike s) =>
Char -> GenRegex s
mkSym1 = mkSym . singleCS
mkSymRng :: (StringLike s) =>
Char -> Char -> GenRegex s
mkSymRng c1 c2 = mkSym $ rangeCS c1 c2
mkWord :: (StringLike s) =>
[Char] -> GenRegex s
mkWord = mkSeqs . map mkSym1
mkDot :: GenRegex s
mkDot = Dot
mkAll :: (StringLike s) =>
GenRegex s
mkAll = mkStar mkDot
mkStar :: (StringLike s) =>
GenRegex s -> GenRegex s
mkStar (Zero _) = mkUnit
mkStar e@Unit = e
mkStar e@(Star _e1) = e
mkStar (Rep 1 e1) = mkStar e1
mkStar (Rep i e1)
| i == 1
||
nullable e1 = mkStar e1
mkStar e@(Rng _ _ e1)
| nullable e = mkStar e1
mkStar e@(Alt _ _) = Star (rmStar e)
mkStar e = Star e
rmStar :: (StringLike s) =>
GenRegex s -> GenRegex s
rmStar (Alt e1 e2) = mkAlt (rmStar e1) (rmStar e2)
rmStar (Star e1) = rmStar e1
rmStar (Rep 1 e1) = rmStar e1
rmStar e1 = e1
mkAlt :: (StringLike s) =>
GenRegex s -> GenRegex s -> GenRegex s
mkAlt e1 (Zero _) = e1
mkAlt (Zero _) e2 = e2
mkAlt (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2
mkAlt e1 e2@(Sym _) = mkAlt e2 e1
mkAlt e1@(Sym _) (Alt e2@(Sym _) e3) = mkAlt (mkAlt e1 e2) e3
mkAlt (Sym _) e2@Dot = e2
mkAlt e1@(Star Dot) _e2 = e1
mkAlt _e1 e2@(Star Dot) = e2
mkAlt (Alt e1 e2) e3 = mkAlt e1 (mkAlt e2 e3)
mkAlt e1 e2
| e1 == e2 = e1
| otherwise = Alt e1 e2
mkElse :: (StringLike s) =>
GenRegex s -> GenRegex s -> GenRegex s
mkElse e1 (Zero _) = e1
mkElse (Zero _) e2 = e2
mkElse (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2
mkElse e1@(Sym _) (Else e2@(Sym _) e3) = mkElse (mkElse e1 e2) e3
mkElse (Sym _) e2@Dot = e2
mkElse e1@(Star Dot) _e2 = e1
mkElse _e1 e2@(Star Dot) = e2
mkElse (Else e1 e2) e3 = mkElse e1 (mkElse e2 e3)
mkElse e1 e2
| e1 == e2 = e1
| otherwise = Else e1 e2
mkSeq :: GenRegex s -> GenRegex s -> GenRegex s
mkSeq e1@(Zero _) _e2 = e1
mkSeq _e1 e2@(Zero _) = e2
mkSeq Unit e2 = e2
mkSeq (Cbr ss1 e1) e2 = mkCbr ss1 (mkSeq e1 e2)
mkSeq e1 Unit = e1
mkSeq (Seq e1 e2) e3 = mkSeq e1 (mkSeq e2 e3)
mkSeq e1 e2 = Seq e1 e2
mkSeqs :: [GenRegex s] -> GenRegex s
mkSeqs = foldr mkSeq mkUnit
mkRep :: (StringLike s) =>
Int -> GenRegex s -> GenRegex s
mkRep 0 e = mkStar e
mkRep _ e@(Zero _) = e
mkRep _ e
| nullable e = mkStar e
mkRep i (Rep j e) = mkRep (i * j) e
mkRep i e = Rep i e
mkRng :: (StringLike s) =>
Int -> Int -> GenRegex s -> GenRegex s
mkRng 0 0 _e = mkUnit
mkRng 1 1 e = e
mkRng lb ub _e
| lb > ub = mkZero' $
"illegal range " ++
show lb ++ ".." ++ show ub
mkRng _l _u e@(Zero _) = e
mkRng _l _u e@Unit = e
mkRng lb ub e = Rng lb ub e
mkOpt :: (StringLike s) =>
GenRegex s -> GenRegex s
mkOpt = mkRng 0 1
mkDiff :: (StringLike s) =>
GenRegex s -> GenRegex s -> GenRegex s
mkDiff e1@(Zero _) _e2 = e1
mkDiff e1 (Zero _) = e1
mkDiff _e1 (Star Dot) = mkZero' "empty set in difference expr"
mkDiff Dot (Sym p) = mkSym $ compCS p
mkDiff (Sym _) Dot = mkZero' "empty set in difference expr"
mkDiff (Sym p1) (Sym p2) = mkSym $ p1 `diffCS` p2
mkDiff e1 e2
| e1 == e2 = mkZero' "empty set in difference expr"
| otherwise = Diff e1 e2
mkCompl :: (StringLike s) =>
GenRegex s -> GenRegex s
mkCompl (Zero _) = mkAll
mkCompl (Star Dot) = mkZero' "empty set in compl expr"
mkCompl e = mkDiff (mkStar mkDot) e
mkIsect :: (StringLike s) =>
GenRegex s -> GenRegex s -> GenRegex s
mkIsect e1@(Zero _) _e2 = e1
mkIsect _e1 e2@(Zero _) = e2
mkIsect e1@(Unit) e2
| nullable e2 = e1
| otherwise = mkZero' "intersection empty"
mkIsect e1 e2@(Unit) = mkIsect e2 e1
mkIsect (Sym p1) (Sym p2) = mkSym $ p1 `intersectCS` p2
mkIsect e1@(Sym _) Dot = e1
mkIsect Dot e2@(Sym _) = e2
mkIsect (Star Dot) e2 = e2
mkIsect e1 (Star Dot) = e1
mkIsect e1 e2
| e1 == e2 = e1
| otherwise = Isec e1 e2
mkExor :: (StringLike s) =>
GenRegex s -> GenRegex s -> GenRegex s
mkExor (Zero _) e2 = e2
mkExor e1 (Zero _) = e1
mkExor (Star Dot) _e2 = mkZero' "empty set in exor expr"
mkExor _e1 (Star Dot) = mkZero' "empty set in exor expr"
mkExor (Sym p1) (Sym p2) = mkSym $ p1 `exorCS` p2
mkExor (Sym p1) Dot = mkSym $ compCS p1
mkExor Dot (Sym p2) = mkSym $ compCS p2
mkExor e1 e2
| e1 == e2 = mkZero' "empty set in exor expr"
| otherwise = Exor e1 e2
mkInterleave :: GenRegex s -> GenRegex s -> GenRegex s
mkInterleave e1@(Zero _) _ = e1
mkInterleave _ e2@(Zero _) = e2
mkInterleave (Unit) e2 = e2
mkInterleave e1 (Unit) = e1
mkInterleave e1 e2 = Intl e1 e2
mkBr :: s -> GenRegex s -> GenRegex s
mkBr l e = Br (Just l) e
mkBr' :: StringLike s =>
String -> GenRegex s -> GenRegex s
mkBr' l e = Br (Just $ fromString l) e
mkBrN :: GenRegex s -> GenRegex s
mkBrN e = Br Nothing e
mkObr :: StringLike s =>
Label s -> s -> Int -> GenRegex s -> GenRegex s
mkObr _ _ _ e@(Zero _) = e
mkObr l s n Unit = mkCbr [(l, takeS n s)] mkUnit
mkObr l s n e = Obr l s n e
mkCbr :: SubexResults s -> GenRegex s -> GenRegex s
mkCbr _ e@(Zero _) = e
mkCbr ss (Cbr ss1 e) = mkCbr (ss <> ss1) e
mkCbr ss e = Cbr ss e
instance (StringLike s) => Show (GenRegex s) where
show (Zero e) = "{" ++ toString e ++ "}"
show Unit = "()"
show (Sym p)
| p == compCS (stringCS "\n\r")
= "."
| null (tail cs) &&
rng1 (head cs)
= escRng . head $ cs
| otherwise = "[" ++ concat cs' ++ "]"
where
rng1 (x,y) = x == y
cs = p
cs' = map escRng p
escRng (x, y)
| x == y = esc x
| succ x == y
= esc x ++ esc y
| otherwise
= esc x ++ "-" ++ esc y
esc x
| x `elem` "\\-[]{}()*+?.^"
= '\\':x:""
| x >= ' ' && x <= '~'
= x:""
| otherwise
= "&#" ++ show (fromEnum x) ++ ";"
show Dot = "\\a"
show (Star Dot) = "\\A"
show (Star e) = "(" ++ show e ++ "*)"
show (Alt e1 e2) = "(" ++ show e1 ++ "|" ++ show e2 ++ ")"
show (Else e1 e2) = "(" ++ show e1 ++ "{|}" ++ show e2 ++ ")"
show (Seq e1 e2) = "(" ++ show e1 ++ show e2 ++ ")"
show (Rep 1 e) = "(" ++ show e ++ "+)"
show (Rep i e) = "(" ++ show e ++ "{" ++ show i ++ ",})"
show (Rng 0 1 e) = "(" ++ show e ++ "?)"
show (Rng i j e) = "(" ++ show e ++ "{" ++ show i ++ "," ++ show j ++ "})"
show (Diff e1 e2) = "(" ++ show e1 ++ "{\\}" ++ show e2 ++ ")"
show (Isec e1 e2) = "(" ++ show e1 ++ "{&}" ++ show e2 ++ ")"
show (Exor e1 e2) = "(" ++ show e1 ++ "{^}" ++ show e2 ++ ")"
show (Intl e1 e2) = "(" ++ show e1 ++ "{:}" ++ show e2 ++ ")"
show (Br l e) = "({" ++ showL l ++ "}" ++ show e ++ ")"
show (Obr l s n e) = "({" ++ showL l ++ "=" ++ toString (takeS n s) ++ "}" ++ show e ++ ")"
show (Cbr ss e) = "([" ++ intercalate "," (map (\ (l, s) -> showL l ++ "=" ++ toString s) ss) ++ "]"
++ show e ++
")"
showL :: Show s => Label s -> String
showL = rmq . maybe "" show
where
rmq ('\"':xs) = init xs
rmq xs = xs
isZero :: GenRegex s -> Bool
isZero (Zero _) = True
isZero _ = False
errRegex :: (StringLike s) =>
GenRegex s -> s
errRegex (Zero e) = e
errRegex _ = emptyS
nullable :: (StringLike s) =>
GenRegex s -> Bool
nullable = fst . nullable'
nullable' :: (StringLike s) =>
GenRegex s -> Nullable s
nullable' (Zero _) = (False, [])
nullable' Unit = (True, [])
nullable' Dot = (False, [])
nullable' (Sym _x) = (False, [])
nullable' (Star _e) = (True, [])
nullable' (Rep _i e) = nullable' e
nullable' (Rng i _ e) = (i == 0, []) `unionN` nullable' e
nullable' (Seq e1 e2) = nullable' e1 `isectN` nullable' e2
nullable' (Alt e1 e2) = nullable' e1 `unionN` nullable' e2
nullable' (Else e1 e2) = nullable' e1 `orElseN` nullable' e2
nullable' (Isec e1 e2) = nullable' e1 `isectN` nullable' e2
nullable' (Diff e1 e2) = nullable' e1 `diffN` nullable' e2
nullable' (Exor e1 e2) = nullable' e1 `exorN` nullable' e2
nullable' (Intl e1 e2) = nullable' e1 `isectN` nullable' e2
nullable' (Br l e) = (True, [(l, emptyS )]) `isectN` nullable' e
nullable' (Obr l s n e) = (True, [(l, takeS n s)]) `isectN` nullable' e
nullable' (Cbr ss e) = (True, ss) `isectN` nullable' e
isectN :: Nullable s -> Nullable s -> Nullable s
isectN (True, ws1) (True, ws2) = (True, ws1 ++ ws2)
isectN _ _ = (False, [])
unionN :: Nullable s -> Nullable s -> Nullable s
unionN (False, _) (False, _) = (False, [])
unionN (_, ws1) (_, ws2) = (True, ws1 ++ ws2)
orElseN :: Nullable s -> Nullable s -> Nullable s
orElseN e1@(True, _ws1) _ = e1
orElseN _ e2 = e2
diffN :: Nullable s -> Nullable s -> Nullable s
diffN n1 (False, _) = n1
diffN _ _ = (False, [])
exorN :: Nullable s -> Nullable s -> Nullable s
exorN n1@(True, _) (False, _) = n1
exorN (False, _) n2@(True, _) = n2
exorN _ _ = (False, [])
firstChars :: (StringLike s) =>
GenRegex s -> CharSet
firstChars (Zero _) = emptyCS
firstChars Unit = emptyCS
firstChars (Sym p) = p
firstChars Dot = allCS
firstChars (Star e1) = firstChars e1
firstChars (Alt e1 e2) = firstChars e1 `unionCS` firstChars e2
firstChars (Else e1 e2) = firstChars e1 `unionCS` firstChars e2
firstChars (Seq e1 e2)
| nullable e1 = firstChars e1 `unionCS` firstChars e2
| otherwise = firstChars e1
firstChars (Rep _i e) = firstChars e
firstChars (Rng _i _j e) = firstChars e
firstChars (Diff e1 _e2) = firstChars e1
firstChars (Isec e1 e2) = firstChars e1 `intersectCS` firstChars e2
firstChars (Exor e1 e2) = firstChars e1 `unionCS` firstChars e2
firstChars (Intl e1 e2) = firstChars e1 `unionCS` firstChars e2
firstChars (Br _l e) = firstChars e
firstChars (Obr _l _s _n e) = firstChars e
firstChars (Cbr _ss e) = firstChars e
delta1 :: (StringLike s) => Char -> s -> GenRegex s -> GenRegex s
delta1 c inp e0
= d' e0
where
d' e@(Zero _) = e
d' Unit = mkZero' $
"unexpected char " ++ show c
d' (Sym p)
| c `elemCS` p = mkUnit
| otherwise = mkZero' $
"unexpected char " ++ show c
d' Dot = mkUnit
d' e@(Star Dot) = e
d' e@(Star e1) = mkSeq (d' e1) e
d' (Alt e1 e2) = mkAlt (d' e1) (d' e2)
d' (Else e1 e2) = mkElse (d' e1) (d' e2)
d' (Seq e1@(Obr l s n e1') e2)
| nu = mkAlt (mkSeq (d' e1) e2)
(mkCbr ((l, takeS n s) : ws) (d' e2))
where
(nu, ws) = nullable' e1'
d' (Seq e1 e2)
| nullable e1 = mkAlt (mkSeq (d' e1) e2)
(d' e2)
| otherwise = mkSeq (d' e1) e2
d' (Rep i e) = mkSeq (d' e) (mkRep (i1) e)
d' (Rng i j e) = mkSeq (d' e) (mkRng ((i1) `max` 0) (j1) e)
d' (Diff e1 e2) = mkDiff (d' e1) (d' e2)
d' (Isec e1 e2) = mkIsect (d' e1) (d' e2)
d' (Exor e1 e2) = mkExor (d' e1) (d' e2)
d' (Intl e1 e2) = mkAlt (mkInterleave (d' e1) e2 )
(mkInterleave e1 (d' e2))
d' (Br l e) = d' (mkObr l inp 0 e)
d' (Obr l s n e) = mkObr l s (n + 1) (d' e)
d' (Cbr ss e) = mkCbr ss (d' e)
delta :: (StringLike s) => s -> GenRegex s -> GenRegex s
delta inp@(uncons -> Just (c, inp')) e0
= d' e0
where
d' e@(Zero _) = e
d' e@(Star Dot) = e
d' e = delta inp' (
delta1 c inp e)
delta _empty e
= e
matchWithRegex :: (StringLike s) =>
GenRegex s -> s -> Bool
matchWithRegex e s
= nullable $ delta s e
matchWithRegex' :: (StringLike s) =>
GenRegex s -> s -> Maybe (SubexResults s)
matchWithRegex' e s
= (\ (r, l) -> if r then Just l else Nothing) . nullable' $ delta s e
splitWithRegex :: (StringLike s) =>
GenRegex s -> s -> Maybe (SubexResults s, s)
splitWithRegex re inp
= do
(re', rest) <- splitWithRegex' (mkBrN re) inp
return ( snd . nullable' $ re', rest)
splitWithRegexCS :: (StringLike s) =>
GenRegex s -> CharSet -> s -> Maybe (SubexResults s, s)
splitWithRegexCS re cs inp
= do
(re', rest) <- splitWithRegexCS' (mkBrN re) cs inp
return ( snd . nullable' $ re', rest)
splitWithRegex' :: (StringLike s) =>
GenRegex s -> s -> Maybe (GenRegex s, s)
splitWithRegex' re inp
= splitWithRegex''
( if nullable re
then Just (re, inp)
else Nothing
) re inp
splitWithRegex'' :: (StringLike s) =>
Maybe (GenRegex s, s) -> GenRegex s -> s -> Maybe (GenRegex s, s)
splitWithRegex'' lastRes re inp@(uncons -> Just (c, inp'))
| isZero re = lastRes
| otherwise = splitWithRegex'' nextRes re' $ inp'
where
re' = delta1 c inp re
nextRes
| nullable re' = Just (re', inp')
| otherwise = lastRes
splitWithRegex'' lastRes _re _empty
= lastRes
splitWithRegexCS' :: (StringLike s) =>
GenRegex s -> CharSet -> s -> Maybe (GenRegex s, s)
splitWithRegexCS' re cs inp@(uncons -> Just (c, _inp'))
| c `elemCS` cs = splitWithRegex' re inp
splitWithRegexCS' re _cs inp
| nullable re = Just (re, inp)
| otherwise = Nothing