{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}
module Data.Text.AhoCorasick.Replacer
(
Replacer (..)
, build
, compose
, run
, runWithLimit
, Needle
, Replacement
, Payload (..)
) where
import Control.DeepSeq (NFData)
import Data.Hashable (Hashable)
import Data.List (sort)
import Data.Maybe (fromJust)
import Data.Text (Text)
import GHC.Generics (Generic)
#if defined(HAS_AESON)
import qualified Data.Aeson as AE
#endif
import qualified Data.Text as Text
import Data.Text.AhoCorasick.Automaton (CaseSensitivity (..), CodeUnitIndex)
import Data.Text.AhoCorasick.Searcher (Searcher)
import qualified Data.Text.AhoCorasick.Automaton as Aho
import qualified Data.Text.AhoCorasick.Searcher as Searcher
import qualified Data.Text.Utf16 as Utf16
type Needle = Text
type Replacement = Text
type Priority = Int
data Payload = Payload
{ needlePriority :: {-# UNPACK #-} !Priority
, needleLength :: {-# UNPACK #-} !CodeUnitIndex
, needleReplacement :: !Replacement
}
#if defined(HAS_AESON)
deriving (Eq, Generic, Hashable, NFData, Show, AE.FromJSON, AE.ToJSON)
#else
deriving (Eq, Generic, Hashable, NFData, Show)
#endif
data Replacer = Replacer
{ replacerCaseSensitivity :: CaseSensitivity
, replacerSearcher :: Searcher Payload
}
deriving stock (Show, Eq, Generic)
#if defined(HAS_AESON)
deriving (Hashable, NFData, AE.FromJSON, AE.ToJSON)
#else
deriving (Hashable, NFData)
#endif
build :: CaseSensitivity -> [(Needle, Replacement)] -> Replacer
build caseSensitivity replaces = Replacer caseSensitivity searcher
where
searcher = Searcher.buildWithValues caseSensitivity $ zipWith mapNeedle [0..] replaces
mapNeedle i (needle, replacement) =
let
needle' = case caseSensitivity of
CaseSensitive -> needle
IgnoreCase -> Utf16.lowerUtf16 needle
in
(needle', Payload (-i) (Utf16.lengthUtf16 needle') replacement)
compose :: Replacer -> Replacer -> Maybe Replacer
compose (Replacer case1 searcher1) (Replacer case2 searcher2)
| case1 /= case2 = Nothing
| otherwise =
let
renumber i (needle, Payload _ len replacement) = (needle, Payload (-i) len replacement)
needles1 = Searcher.needles searcher1
needles2 = Searcher.needles searcher2
searcher = Searcher.buildWithValues case1 $ zipWith renumber [0..] (needles1 ++ needles2)
in
Just $ Replacer case1 searcher
data Match = Match !CodeUnitIndex !CodeUnitIndex !Text deriving (Eq, Ord, Show)
replace :: [Match] -> Text -> Text
replace matches haystack = Text.concat $ go 0 matches haystack
where
go :: CodeUnitIndex -> [Match] -> Text -> [Text]
go !_offset [] remainder = [remainder]
go !offset ((Match pos len replacement) : ms) remainder =
let
(prefix, suffix) = Utf16.unsafeCutUtf16 (pos - offset) len remainder
in
prefix : replacement : go (pos + len) ms suffix
replacementLength :: [Match] -> Text -> CodeUnitIndex
replacementLength matches initial = go matches (Utf16.lengthUtf16 initial)
where
go [] !acc = acc
go (Match _ matchLen repl : rest) !acc = go rest (acc - matchLen + Utf16.lengthUtf16 repl)
removeOverlap :: [Match] -> [Match]
removeOverlap matches = case matches of
[] -> []
m:[] -> m:[]
(m0@(Match pos0 len0 _) : m1@(Match pos1 _ _) : ms) ->
if pos1 >= pos0 + len0
then m0 : removeOverlap (m1:ms)
else removeOverlap (m0:ms)
{-# INLINE prependMatch #-}
prependMatch :: Priority -> (Priority, [Match]) -> Aho.Match Payload -> Aho.Next (Priority, [Match])
prependMatch !threshold (!pBest, !matches) (Aho.Match pos (Payload pMatch len replacement))
| pMatch < threshold && pMatch > pBest = Aho.Step (pMatch, [Match (pos - len) len replacement])
| pMatch < threshold && pMatch == pBest = Aho.Step (pMatch, (Match (pos - len) len replacement) : matches)
| otherwise = Aho.Step (pBest, matches)
run :: Replacer -> Text -> Text
run replacer = fromJust . runWithLimit replacer maxBound
{-# NOINLINE runWithLimit #-}
runWithLimit :: Replacer -> CodeUnitIndex -> Text -> Maybe Text
runWithLimit (Replacer case_ searcher) maxLength = go initialThreshold
where
!automaton = Searcher.automaton searcher
!initialThreshold = 1
!minPriority = 1 - Searcher.numNeedles searcher
go :: Priority -> Text -> Maybe Text
go !threshold haystack =
let
seed = (minBound :: Priority, [])
matchesWithPriority = case case_ of
CaseSensitive -> Aho.runText seed (prependMatch threshold) automaton haystack
IgnoreCase -> Aho.runLower seed (prependMatch threshold) automaton haystack
in
case matchesWithPriority of
(_, []) -> Just haystack
(p, matches)
| replacementLength matches haystack > maxLength -> Nothing
| p == minPriority -> Just $ replace (removeOverlap $ sort matches) haystack
| otherwise -> go p $ replace (removeOverlap $ sort matches) haystack