module Text.XML.HXT.Arrow.XmlRegex
( XmlRegex
, mkZero
, mkUnit
, mkPrim
, mkPrim'
, mkPrimA
, mkDot
, mkStar
, mkAlt
, mkAlts
, mkSeq
, mkSeqs
, mkRep
, mkRng
, mkOpt
, mkPerm
, mkPerms
, mkMerge
, nullable
, delta
, matchXmlRegex
, splitXmlRegex
, scanXmlRegex
, matchRegexA
, splitRegexA
, scanRegexA
)
where
import Control.Arrow.ListArrows
import Data.Maybe
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.DOM.ShowXml (xshow)
matchRegexA :: XmlRegex -> LA XmlTree XmlTree -> LA XmlTree XmlTrees
matchRegexA re ts = ts >>. (\ s -> maybe [s] (const []) . matchXmlRegex re $ s)
splitRegexA :: XmlRegex -> LA XmlTree XmlTree -> LA XmlTree (XmlTrees, XmlTrees)
splitRegexA re ts = ts >>. (maybeToList . splitXmlRegex re)
scanRegexA :: XmlRegex -> LA XmlTree XmlTree -> LA XmlTree XmlTrees
scanRegexA re ts = ts >>. (fromMaybe [] . scanXmlRegex re)
data XmlRegex = Zero String
| Unit
| Sym (XmlTree -> Bool) String
| Dot
| Star XmlRegex
| Alt XmlRegex XmlRegex
| Seq XmlRegex XmlRegex
| Rep Int XmlRegex
| Rng Int Int XmlRegex
| Perm XmlRegex XmlRegex
| Merge XmlRegex XmlRegex
mkZero :: String -> XmlRegex
mkZero = Zero
mkUnit :: XmlRegex
mkUnit = Unit
mkPrim :: (XmlTree -> Bool) -> XmlRegex
mkPrim p = Sym p ""
mkPrim' :: (XmlTree -> Bool) -> String -> XmlRegex
mkPrim' = Sym
mkPrimA :: LA XmlTree XmlTree -> XmlRegex
mkPrimA a = mkPrim (not . null . runLA a)
mkDot :: XmlRegex
mkDot = Dot
mkStar :: XmlRegex -> XmlRegex
mkStar (Zero _) = mkUnit
mkStar e@Unit = e
mkStar e@(Star _e1) = e
mkStar (Rep 1 e1) = mkStar e1
mkStar e@(Alt _ _) = Star (rmStar e)
mkStar e = Star e
rmStar :: XmlRegex -> XmlRegex
rmStar (Alt e1 e2) = mkAlt (rmStar e1) (rmStar e2)
rmStar (Star e1) = rmStar e1
rmStar (Rep 1 e1) = rmStar e1
rmStar e1 = e1
mkAlt :: XmlRegex -> XmlRegex -> XmlRegex
mkAlt e1 (Zero _) = e1
mkAlt (Zero _) e2 = e2
mkAlt e1@(Star Dot) _e2 = e1
mkAlt _e1 e2@(Star Dot) = e2
mkAlt (Sym p1 e1) (Sym p2 e2) = mkPrim' (\ x -> p1 x || p2 x) (e e1 e2)
where
e "" x2 = x2
e x1 "" = x1
e x1 x2 = x1 ++ "|" ++ x2
mkAlt e1 e2@(Sym _ _) = mkAlt e2 e1
mkAlt e1@(Sym _ _) (Alt e2@(Sym _ _) e3)
= mkAlt (mkAlt e1 e2) e3
mkAlt (Alt e1 e2) e3 = mkAlt e1 (mkAlt e2 e3)
mkAlt e1 e2 = Alt e1 e2
mkAlts :: [XmlRegex] -> XmlRegex
mkAlts = foldr mkAlt (mkZero "")
mkSeq :: XmlRegex -> XmlRegex -> XmlRegex
mkSeq e1@(Zero _) _e2 = e1
mkSeq _e1 e2@(Zero _) = e2
mkSeq Unit e2 = e2
mkSeq e1 Unit = e1
mkSeq (Seq e1 e2) e3 = mkSeq e1 (mkSeq e2 e3)
mkSeq e1 e2 = Seq e1 e2
mkSeqs :: [XmlRegex] -> XmlRegex
mkSeqs = foldr mkSeq mkUnit
mkRep :: Int -> XmlRegex -> XmlRegex
mkRep 0 e = mkStar e
mkRep _ e@(Zero _) = e
mkRep _ e@Unit = e
mkRep i e = Rep i e
mkRng :: Int -> Int -> XmlRegex -> XmlRegex
mkRng 0 0 _e = mkUnit
mkRng 1 1 e = e
mkRng lb ub _e
| lb > ub = Zero $
"illegal range " ++
show lb ++ ".." ++ show ub
mkRng _l _u e@(Zero _) = e
mkRng _l _u e@Unit = e
mkRng lb ub e = Rng lb ub e
mkOpt :: XmlRegex -> XmlRegex
mkOpt = mkRng 0 1
mkPerm :: XmlRegex -> XmlRegex -> XmlRegex
mkPerm e1@(Zero _) _ = e1
mkPerm _ e2@(Zero _) = e2
mkPerm Unit e2 = e2
mkPerm e1 Unit = e1
mkPerm e1 e2 = Perm e1 e2
mkPerms :: [XmlRegex] -> XmlRegex
mkPerms = foldr mkPerm mkUnit
mkMerge :: XmlRegex -> XmlRegex -> XmlRegex
mkMerge e1@(Zero _) _ = e1
mkMerge _ e2@(Zero _) = e2
mkMerge Unit e2 = e2
mkMerge e1 Unit = e1
mkMerge e1 e2 = Merge e1 e2
instance Show XmlRegex where
show (Zero s) = "{err:" ++ s ++ "}"
show Unit = "()"
show (Sym _p "") = "<pred>"
show (Sym _p r ) = r
show Dot = "."
show (Star e) = "(" ++ show e ++ ")*"
show (Alt e1 e2) = "(" ++ show e1 ++ "|" ++ show e2 ++ ")"
show (Seq e1 e2) = show e1 ++ show e2
show (Rep 1 e) = "(" ++ show e ++ ")+"
show (Rep i e) = "(" ++ show e ++ "){" ++ show i ++ ",}"
show (Rng 0 1 e) = "(" ++ show e ++ ")?"
show (Rng i j e) = "(" ++ show e ++ "){" ++ show i ++ "," ++ show j ++ "}"
show (Perm e1 e2) = "(" ++ show e1 ++ show e2 ++ "|" ++ show e2 ++ show e1 ++ ")"
show (Merge e1 e2) = "(" ++ show e1 ++ "&" ++ show e2 ++ ")"
unexpected :: XmlTree -> String -> String
unexpected t e = emsg e ++ (cut 80 . xshow) [t]
where
emsg "" = "unexpected: "
emsg s = "expected: " ++ s ++ ", but got: "
cut n s
| null rest = s'
| otherwise = s' ++ "..."
where
(s', rest) = splitAt n s
nullable :: XmlRegex -> Bool
nullable (Zero _) = False
nullable Unit = True
nullable (Sym _p _) = False
nullable Dot = False
nullable (Star _) = True
nullable (Alt e1 e2) = nullable e1 ||
nullable e2
nullable (Seq e1 e2) = nullable e1 &&
nullable e2
nullable (Rep _i e) = nullable e
nullable (Rng i _ e) = i == 0 ||
nullable e
nullable (Perm e1 e2) = nullable e1 &&
nullable e2
nullable (Merge e1 e2) = nullable e1 &&
nullable e2
delta :: XmlRegex -> XmlTree -> XmlRegex
delta e@(Zero _) _ = e
delta Unit c = mkZero $ unexpected c ""
delta (Sym p e) c
| p c = mkUnit
| otherwise = mkZero $ unexpected c e
delta Dot _ = mkUnit
delta e@(Star e1) c = mkSeq (delta e1 c) e
delta (Alt e1 e2) c = mkAlt (delta e1 c) (delta e2 c)
delta (Seq e1 e2) c
| nullable e1 = mkAlt (mkSeq (delta e1 c) e2) (delta e2 c)
| otherwise = mkSeq (delta e1 c) e2
delta (Rep i e) c = mkSeq (delta e c) (mkRep (i1) e)
delta (Rng i j e) c = mkSeq (delta e c) (mkRng ((i1) `max` 0) (j1) e)
delta (Perm e1 e2) c = case e1' of
(Zero _) -> mkPerm e1 (delta e2 c)
_ -> mkPerm e1' e2
where
e1' = delta e1 c
delta (Merge e1 e2) c = mkAlt (mkMerge (delta e1 c) e2)
(mkMerge e1 (delta e2 c))
delta' :: XmlRegex -> XmlTrees -> XmlRegex
delta' = foldl delta
matchXmlRegex :: XmlRegex -> XmlTrees -> Maybe String
matchXmlRegex e
= res . delta' e
where
res (Zero er) = Just er
res re
| nullable re = Nothing
| otherwise = Just $ "input does not match " ++ show e
splitXmlRegex :: XmlRegex -> XmlTrees -> Maybe (XmlTrees, XmlTrees)
splitXmlRegex re = splitXmlRegex' re []
splitXmlRegex' :: XmlRegex -> XmlTrees -> XmlTrees -> Maybe (XmlTrees, XmlTrees)
splitXmlRegex' re res []
| nullable re = Just (reverse res, [])
| otherwise = Nothing
splitXmlRegex' (Zero _) _ _
= Nothing
splitXmlRegex' re res xs@(x:xs')
| isJust res' = res'
| nullable re = Just (reverse res, xs)
| otherwise = Nothing
where
re' = delta re x
res' = splitXmlRegex' re' (x:res) xs'
scanXmlRegex :: XmlRegex -> XmlTrees -> Maybe [XmlTrees]
scanXmlRegex re ts = scanXmlRegex' re (splitXmlRegex re ts)
scanXmlRegex' :: XmlRegex -> Maybe (XmlTrees, XmlTrees) -> Maybe [XmlTrees]
scanXmlRegex' _ Nothing = Nothing
scanXmlRegex' _ (Just (rs, [])) = Just [rs]
scanXmlRegex' _ (Just ([], _)) = Nothing
scanXmlRegex' re (Just (rs, rest))
| isNothing res = Nothing
| otherwise = Just (rs : fromJust res)
where
res = scanXmlRegex' re (splitXmlRegex re rest)