{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BinaryLiterals #-}
module Regex.KDE.Match
( matchRegex
) where
import qualified Data.ByteString as B
import Data.ByteString (ByteString)
import qualified Data.ByteString.UTF8 as U
import qualified Data.Set as Set
import Data.Set (Set)
import Regex.KDE.Regex
import qualified Data.IntMap.Strict as M
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup ((<>))
#endif
data Match =
Match { matchBytes :: !ByteString
, matchOffset :: !Int
, matchCaptures :: !(M.IntMap (Int, Int))
} deriving (Show, Eq)
instance Ord Match where
m1 <= m2
| matchOffset m1 > matchOffset m2 = True
| matchOffset m1 < matchOffset m2 = False
| otherwise = matchCaptures m1 >= matchCaptures m2
mapMatching :: (Match -> Match) -> Set Match -> Set Match
mapMatching f = Set.filter ((>= 0) . matchOffset) . Set.map f
sizeLimit :: Int
sizeLimit = 2000
prune :: Set Match -> Set Match
prune ms = if Set.size ms > sizeLimit
then Set.take sizeLimit ms
else ms
exec :: Regex -> Direction -> Regex -> Set Match -> Set Match
exec _ _ MatchNull = id
exec top dir (Lazy re) =
exec top dir (MatchConcat (Lazy re) MatchNull)
exec top dir (Possessive re) =
foldr
(\elt s -> case Set.lookupMin (exec top dir re (Set.singleton elt)) of
Nothing -> s
Just m -> Set.insert m s)
mempty
exec top dir (MatchDynamic n) =
exec top dir (MatchChar (== '%') <>
mconcat (map (\c -> MatchChar (== c)) (show n)))
exec _ _ AssertEnd = Set.filter (\m -> matchOffset m == B.length (matchBytes m))
exec _ _ AssertBeginning = Set.filter (\m -> matchOffset m == 0)
exec top _ (AssertPositive dir regex) =
Set.filter (\m -> not (null (exec top dir regex (Set.singleton m))))
exec top _ (AssertNegative dir regex) =
Set.filter (\m -> null (exec top dir regex (Set.singleton m)))
exec _ _ AssertWordBoundary = Set.filter atWordBoundary
exec _ Forward MatchAnyChar = mapMatching $ \m ->
case U.decode (B.drop (matchOffset m) (matchBytes m)) of
Nothing -> m{ matchOffset = - 1}
Just (_,n) -> m{ matchOffset = matchOffset m + n }
exec _ Backward MatchAnyChar = mapMatching $ \m ->
case lastCharOffset (matchBytes m) (matchOffset m) of
Nothing -> m{ matchOffset = -1 }
Just off -> m{ matchOffset = off }
exec _ Forward (MatchChar f) = mapMatching $ \m ->
case U.decode (B.drop (matchOffset m) (matchBytes m)) of
Just (c,n) | f c -> m{ matchOffset = matchOffset m + n }
_ -> m{ matchOffset = -1 }
exec _ Backward (MatchChar f) = mapMatching $ \m ->
case lastCharOffset (matchBytes m) (matchOffset m) of
Nothing -> m{ matchOffset = -1 }
Just off ->
case U.decode (B.drop off (matchBytes m)) of
Just (c,_) | f c -> m{ matchOffset = off }
_ -> m{ matchOffset = -1 }
exec top dir (MatchConcat (MatchConcat r1 r2) r3) =
exec top dir (MatchConcat r1 (MatchConcat r2 r3))
exec top Forward (MatchConcat (Lazy r1) r2) =
Set.foldl Set.union mempty . Set.map
(\m ->
let ms1 = exec top Forward r1 (Set.singleton m)
in if Set.null ms1
then ms1
else go ms1)
where
go ms = case Set.lookupMax ms of
Nothing -> Set.empty
Just m' ->
let s' = exec top Forward r2 (Set.singleton m')
in if Set.null s'
then go (Set.delete m' ms)
else s'
exec top Forward (MatchConcat r1 r2) =
\ms ->
let ms1 = exec top Forward r1 ms
in if Set.null ms1
then ms1
else exec top Forward r2 (prune ms1)
exec top Backward (MatchConcat r1 r2) =
exec top Backward r1 . exec top Backward r2
exec top dir (MatchAlt r1 r2) = \ms -> exec top dir r1 ms <> exec top dir r2 ms
exec top dir (MatchSome re) = go
where
go ms = case exec top dir re ms of
ms' | Set.null ms' -> Set.empty
| ms' == ms -> ms
| otherwise -> let ms'' = prune ms'
in ms'' <> go ms''
exec top dir (MatchCapture i re) =
Set.foldr Set.union Set.empty .
Set.map (\m ->
Set.map (captureDifference m) (exec top dir re (Set.singleton m)))
where
captureDifference m m' =
let len = matchOffset m' - matchOffset m
in m'{ matchCaptures = M.insert i (matchOffset m, len)
(matchCaptures m') }
exec _ dir (MatchCaptured n) = mapMatching matchCaptured
where
matchCaptured m =
case M.lookup n (matchCaptures m) of
Just (offset, len) ->
let capture = B.take len $ B.drop offset $ matchBytes m
in case dir of
Forward | B.isPrefixOf capture
(B.drop (matchOffset m) (matchBytes m))
-> m{ matchOffset = matchOffset m + B.length capture }
Backward | B.isSuffixOf capture
(B.take (matchOffset m) (matchBytes m))
-> m{ matchOffset = matchOffset m - B.length capture }
_ -> m{ matchOffset = -1 }
Nothing -> m{ matchOffset = -1 }
exec top dir Recurse = \ms -> if Set.null ms
then ms
else exec top dir top ms
atWordBoundary :: Match -> Bool
atWordBoundary m =
case matchOffset m of
0 -> True
n | n == B.length (matchBytes m) -> True
| otherwise ->
case lastCharOffset (matchBytes m) (matchOffset m) of
Nothing -> True
Just off ->
case U.toString (B.drop (off - 1) (matchBytes m)) of
(prev:cur:next:_) ->
(isWordChar cur /= isWordChar next) ||
(isWordChar cur /= isWordChar prev)
_ -> True
lastCharOffset :: ByteString -> Int -> Maybe Int
lastCharOffset _ 0 = Nothing
lastCharOffset _ 1 = Nothing
lastCharOffset bs n =
case B.index bs (n - 2) of
w | w < 0b10000000 -> Just (n - 1)
| w >= 0b11000000 -> Just (n - 1)
| otherwise -> lastCharOffset bs (n - 1)
matchRegex :: Regex
-> ByteString
-> Maybe (ByteString, M.IntMap (Int, Int))
matchRegex re bs =
toResult <$> Set.lookupMin
(exec re Forward re (Set.singleton (Match bs 0 M.empty)))
where
toResult m = (B.take (matchOffset m) (matchBytes m), (matchCaptures m))