{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BinaryLiterals #-}
module Regex.KDE.Match
( matchRegex
) where
import qualified Data.ByteString as B
import Data.ByteString (ByteString)
import qualified Data.ByteString.UTF8 as U
import qualified Data.Set as Set
import Data.Set (Set)
import Regex.KDE.Regex
import qualified Data.IntMap.Strict as M
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup ((<>))
#endif
data Match =
Match { Match -> ByteString
matchBytes :: !ByteString
, Match -> Int
matchOffset :: !Int
, Match -> IntMap (Int, Int)
matchCaptures :: !(M.IntMap (Int, Int))
} deriving (Int -> Match -> ShowS
[Match] -> ShowS
Match -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Match] -> ShowS
$cshowList :: [Match] -> ShowS
show :: Match -> String
$cshow :: Match -> String
showsPrec :: Int -> Match -> ShowS
$cshowsPrec :: Int -> Match -> ShowS
Show, Match -> Match -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Match -> Match -> Bool
$c/= :: Match -> Match -> Bool
== :: Match -> Match -> Bool
$c== :: Match -> Match -> Bool
Eq)
instance Ord Match where
Match
m1 <= :: Match -> Match -> Bool
<= Match
m2
| Match -> Int
matchOffset Match
m1 forall a. Ord a => a -> a -> Bool
> Match -> Int
matchOffset Match
m2 = Bool
True
| Match -> Int
matchOffset Match
m1 forall a. Ord a => a -> a -> Bool
< Match -> Int
matchOffset Match
m2 = Bool
False
| Bool
otherwise = Match -> IntMap (Int, Int)
matchCaptures Match
m1 forall a. Ord a => a -> a -> Bool
>= Match -> IntMap (Int, Int)
matchCaptures Match
m2
mapMatching :: (Match -> Match) -> Set Match -> Set Match
mapMatching :: (Match -> Match) -> Set Match -> Set Match
mapMatching Match -> Match
f = forall a. (a -> Bool) -> Set a -> Set a
Set.filter ((forall a. Ord a => a -> a -> Bool
>= Int
0) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Match -> Int
matchOffset) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map Match -> Match
f
sizeLimit :: Int
sizeLimit :: Int
sizeLimit = Int
2000
prune :: Set Match -> Set Match
prune :: Set Match -> Set Match
prune Set Match
ms = if forall a. Set a -> Int
Set.size Set Match
ms forall a. Ord a => a -> a -> Bool
> Int
sizeLimit
then forall a. Int -> Set a -> Set a
Set.take Int
sizeLimit Set Match
ms
else Set Match
ms
exec :: M.IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec :: IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
_ Direction
_ Regex
MatchNull = forall a. a -> a
id
exec IntMap Regex
cgs Direction
dir (Lazy Regex
re) =
IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir (Regex -> Regex -> Regex
MatchConcat (Regex -> Regex
Lazy Regex
re) Regex
MatchNull)
exec IntMap Regex
cgs Direction
dir (Possessive Regex
re) =
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr
(\Match
elt Set Match
s -> case forall a. Set a -> Maybe a
Set.lookupMin (IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
re (forall a. a -> Set a
Set.singleton Match
elt)) of
Maybe Match
Nothing -> Set Match
s
Just Match
m -> forall a. Ord a => a -> Set a -> Set a
Set.insert Match
m Set Match
s)
forall a. Monoid a => a
mempty
exec IntMap Regex
cgs Direction
dir (MatchDynamic Int
n) =
IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir ((Char -> Bool) -> Regex
MatchChar (forall a. Eq a => a -> a -> Bool
== Char
'%') forall a. Semigroup a => a -> a -> a
<>
forall a. Monoid a => [a] -> a
mconcat (forall a b. (a -> b) -> [a] -> [b]
map (\Char
c -> (Char -> Bool) -> Regex
MatchChar (forall a. Eq a => a -> a -> Bool
== Char
c)) (forall a. Show a => a -> String
show Int
n)))
exec IntMap Regex
_ Direction
_ Regex
AssertEnd = forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> Match -> Int
matchOffset Match
m forall a. Eq a => a -> a -> Bool
== ByteString -> Int
B.length (Match -> ByteString
matchBytes Match
m))
exec IntMap Regex
_ Direction
_ Regex
AssertBeginning = forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> Match -> Int
matchOffset Match
m forall a. Eq a => a -> a -> Bool
== Int
0)
exec IntMap Regex
cgs Direction
_ (AssertPositive Direction
dir Regex
regex) =
forall (f :: * -> *) a. (Foldable f, Ord a) => f (Set a) -> Set a
Set.unions forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map
(\Match
m -> forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map (\Match
m' ->
Match
m'{ matchBytes :: ByteString
matchBytes = Match -> ByteString
matchBytes Match
m,
matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m })
forall a b. (a -> b) -> a -> b
$ IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
regex (forall a. a -> Set a
Set.singleton Match
m))
exec IntMap Regex
cgs Direction
_ (AssertNegative Direction
dir Regex
regex) =
forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> forall (t :: * -> *) a. Foldable t => t a -> Bool
null (IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
regex (forall a. a -> Set a
Set.singleton Match
m)))
exec IntMap Regex
_ Direction
_ Regex
AssertWordBoundary = forall a. (a -> Bool) -> Set a -> Set a
Set.filter Match -> Bool
atWordBoundary
exec IntMap Regex
_ Direction
Forward Regex
MatchAnyChar = (Match -> Match) -> Set Match -> Set Match
mapMatching forall a b. (a -> b) -> a -> b
$ \Match
m ->
case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m)) of
Maybe (Char, Int)
Nothing -> Match
m{ matchOffset :: Int
matchOffset = - Int
1}
Just (Char
_,Int
n) -> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m forall a. Num a => a -> a -> a
+ Int
n }
exec IntMap Regex
_ Direction
Backward Regex
MatchAnyChar = (Match -> Match) -> Set Match -> Set Match
mapMatching forall a b. (a -> b) -> a -> b
$ \Match
m ->
case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
Maybe Int
Nothing -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
Just Int
off -> Match
m{ matchOffset :: Int
matchOffset = Int
off }
exec IntMap Regex
_ Direction
Forward (MatchChar Char -> Bool
f) = (Match -> Match) -> Set Match -> Set Match
mapMatching forall a b. (a -> b) -> a -> b
$ \Match
m ->
case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m)) of
Just (Char
c,Int
n) | Char -> Bool
f Char
c -> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m forall a. Num a => a -> a -> a
+ Int
n }
Maybe (Char, Int)
_ -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec IntMap Regex
_ Direction
Backward (MatchChar Char -> Bool
f) = (Match -> Match) -> Set Match -> Set Match
mapMatching forall a b. (a -> b) -> a -> b
$ \Match
m ->
case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
Maybe Int
Nothing -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
Just Int
off ->
case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop Int
off (Match -> ByteString
matchBytes Match
m)) of
Just (Char
c,Int
_) | Char -> Bool
f Char
c -> Match
m{ matchOffset :: Int
matchOffset = Int
off }
Maybe (Char, Int)
_ -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec IntMap Regex
cgs Direction
dir (MatchConcat (MatchConcat Regex
r1 Regex
r2) Regex
r3) =
IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir (Regex -> Regex -> Regex
MatchConcat Regex
r1 (Regex -> Regex -> Regex
MatchConcat Regex
r2 Regex
r3))
exec IntMap Regex
cgs Direction
Forward (MatchConcat (Lazy Regex
r1) Regex
r2) =
forall a b. (a -> b -> a) -> a -> Set b -> a
Set.foldl forall a. Ord a => Set a -> Set a -> Set a
Set.union forall a. Monoid a => a
mempty forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map
(\Match
m ->
let ms1 :: Set Match
ms1 = IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Forward Regex
r1 (forall a. a -> Set a
Set.singleton Match
m)
in if forall a. Set a -> Bool
Set.null Set Match
ms1
then Set Match
ms1
else Set Match -> Set Match
go Set Match
ms1)
where
go :: Set Match -> Set Match
go Set Match
ms = case forall a. Set a -> Maybe a
Set.lookupMax Set Match
ms of
Maybe Match
Nothing -> forall a. Set a
Set.empty
Just Match
m' ->
let s' :: Set Match
s' = IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Forward Regex
r2 (forall a. a -> Set a
Set.singleton Match
m')
in if forall a. Set a -> Bool
Set.null Set Match
s'
then Set Match -> Set Match
go (forall a. Ord a => a -> Set a -> Set a
Set.delete Match
m' Set Match
ms)
else Set Match
s'
exec IntMap Regex
cgs Direction
Forward (MatchConcat Regex
r1 Regex
r2) =
\Set Match
ms ->
let ms1 :: Set Match
ms1 = IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Forward Regex
r1 Set Match
ms
in if forall a. Set a -> Bool
Set.null Set Match
ms1
then Set Match
ms1
else IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Forward Regex
r2 (Set Match -> Set Match
prune Set Match
ms1)
exec IntMap Regex
cgs Direction
Backward (MatchConcat Regex
r1 Regex
r2) =
IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Backward Regex
r1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Backward Regex
r2
exec IntMap Regex
cgs Direction
dir (MatchAlt Regex
r1 Regex
r2) = \Set Match
ms -> IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
r1 Set Match
ms forall a. Semigroup a => a -> a -> a
<> IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
r2 Set Match
ms
exec IntMap Regex
cgs Direction
dir (MatchSome Regex
re) = Set Match -> Set Match
go
where
go :: Set Match -> Set Match
go Set Match
ms = case IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
re Set Match
ms of
Set Match
ms' | forall a. Set a -> Bool
Set.null Set Match
ms' -> forall a. Set a
Set.empty
| Set Match
ms' forall a. Eq a => a -> a -> Bool
== Set Match
ms -> Set Match
ms
| Bool
otherwise -> let ms'' :: Set Match
ms'' = Set Match -> Set Match
prune Set Match
ms'
in Set Match
ms'' forall a. Semigroup a => a -> a -> a
<> Set Match -> Set Match
go Set Match
ms''
exec IntMap Regex
cgs Direction
dir (MatchCapture Int
i Regex
re) =
forall a b. (a -> b -> b) -> b -> Set a -> b
Set.foldr forall a. Ord a => Set a -> Set a -> Set a
Set.union forall a. Set a
Set.empty forall b c a. (b -> c) -> (a -> b) -> a -> c
.
forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map (\Match
m ->
forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map (Match -> Match -> Match
captureDifference Match
m) (IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
re (forall a. a -> Set a
Set.singleton Match
m)))
where
captureDifference :: Match -> Match -> Match
captureDifference Match
m Match
m' =
let len :: Int
len = Match -> Int
matchOffset Match
m' forall a. Num a => a -> a -> a
- Match -> Int
matchOffset Match
m
in Match
m'{ matchCaptures :: IntMap (Int, Int)
matchCaptures = forall a. Int -> a -> IntMap a -> IntMap a
M.insert Int
i (Match -> Int
matchOffset Match
m, Int
len)
(Match -> IntMap (Int, Int)
matchCaptures Match
m') }
exec IntMap Regex
_ Direction
dir (MatchCaptured Int
n) = (Match -> Match) -> Set Match -> Set Match
mapMatching Match -> Match
matchCaptured
where
matchCaptured :: Match -> Match
matchCaptured Match
m =
case forall a. Int -> IntMap a -> Maybe a
M.lookup Int
n (Match -> IntMap (Int, Int)
matchCaptures Match
m) of
Just (Int
offset, Int
len) ->
let capture :: ByteString
capture = Int -> ByteString -> ByteString
B.take Int
len forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
B.drop Int
offset forall a b. (a -> b) -> a -> b
$ Match -> ByteString
matchBytes Match
m
in case Direction
dir of
Direction
Forward | ByteString -> ByteString -> Bool
B.isPrefixOf ByteString
capture
(Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m))
-> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m forall a. Num a => a -> a -> a
+ ByteString -> Int
B.length ByteString
capture }
Direction
Backward | ByteString -> ByteString -> Bool
B.isSuffixOf ByteString
capture
(Int -> ByteString -> ByteString
B.take (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m))
-> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m forall a. Num a => a -> a -> a
- ByteString -> Int
B.length ByteString
capture }
Direction
_ -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
Maybe (Int, Int)
Nothing -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec IntMap Regex
cgs Direction
dir (Subroutine Int
i) =
case forall a. Int -> IntMap a -> Maybe a
M.lookup Int
i IntMap Regex
cgs of
Maybe Regex
Nothing -> forall a. a -> a
id
Just Regex
re' -> IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
re'
atWordBoundary :: Match -> Bool
atWordBoundary :: Match -> Bool
atWordBoundary Match
m =
case Match -> Int
matchOffset Match
m of
Int
0 -> Bool
True
Int
n | Int
n forall a. Eq a => a -> a -> Bool
== ByteString -> Int
B.length (Match -> ByteString
matchBytes Match
m) -> Bool
True
| Bool
otherwise ->
case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
Maybe Int
Nothing -> Bool
True
Just Int
off ->
case ByteString -> String
U.toString (Int -> ByteString -> ByteString
B.drop (Int
off forall a. Num a => a -> a -> a
- Int
1) (Match -> ByteString
matchBytes Match
m)) of
(Char
prev:Char
cur:Char
next:String
_) ->
(Char -> Bool
isWordChar Char
cur forall a. Eq a => a -> a -> Bool
/= Char -> Bool
isWordChar Char
next) Bool -> Bool -> Bool
||
(Char -> Bool
isWordChar Char
cur forall a. Eq a => a -> a -> Bool
/= Char -> Bool
isWordChar Char
prev)
String
_ -> Bool
True
lastCharOffset :: ByteString -> Int -> Maybe Int
lastCharOffset :: ByteString -> Int -> Maybe Int
lastCharOffset ByteString
_ Int
0 = forall a. Maybe a
Nothing
lastCharOffset ByteString
_ Int
1 = forall a. Maybe a
Nothing
lastCharOffset ByteString
bs Int
n =
case HasCallStack => ByteString -> Int -> Word8
B.index ByteString
bs (Int
n forall a. Num a => a -> a -> a
- Int
2) of
Word8
w | Word8
w forall a. Ord a => a -> a -> Bool
< Word8
0b10000000 -> forall a. a -> Maybe a
Just (Int
n forall a. Num a => a -> a -> a
- Int
1)
| Word8
w forall a. Ord a => a -> a -> Bool
>= Word8
0b11000000 -> forall a. a -> Maybe a
Just (Int
n forall a. Num a => a -> a -> a
- Int
1)
| Bool
otherwise -> ByteString -> Int -> Maybe Int
lastCharOffset ByteString
bs (Int
n forall a. Num a => a -> a -> a
- Int
1)
matchRegex :: Regex
-> ByteString
-> Maybe (ByteString, M.IntMap (Int, Int))
matchRegex :: Regex -> ByteString -> Maybe (ByteString, IntMap (Int, Int))
matchRegex Regex
re ByteString
bs =
let capturingGroups :: IntMap Regex
capturingGroups = Regex -> IntMap Regex
extractCapturingGroups Regex
re
in Match -> (ByteString, IntMap (Int, Int))
toResult forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall a. Set a -> Maybe a
Set.lookupMin
(IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
capturingGroups Direction
Forward Regex
re
(forall a. a -> Set a
Set.singleton (ByteString -> Int -> IntMap (Int, Int) -> Match
Match ByteString
bs Int
0 forall a. IntMap a
M.empty)))
where
toResult :: Match -> (ByteString, IntMap (Int, Int))
toResult Match
m = (Int -> ByteString -> ByteString
B.take (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m), (Match -> IntMap (Int, Int)
matchCaptures Match
m))
extractCapturingGroups :: Regex -> M.IntMap Regex
Regex
regex = forall a. Int -> a -> IntMap a
M.singleton Int
0 Regex
regex forall a. Semigroup a => a -> a -> a
<>
case Regex
regex of
MatchSome Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
MatchAlt Regex
re1 Regex
re2 ->
Regex -> IntMap Regex
extractCapturingGroups Regex
re1 forall a. Semigroup a => a -> a -> a
<> Regex -> IntMap Regex
extractCapturingGroups Regex
re2
MatchConcat Regex
re1 Regex
re2 ->
Regex -> IntMap Regex
extractCapturingGroups Regex
re1 forall a. Semigroup a => a -> a -> a
<> Regex -> IntMap Regex
extractCapturingGroups Regex
re2
MatchCapture Int
i Regex
re -> forall a. Int -> a -> IntMap a
M.singleton Int
i Regex
re
AssertPositive Direction
_ Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
AssertNegative Direction
_ Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
Possessive Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
Lazy Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
Regex
_ -> forall a. Monoid a => a
mempty