-- Alfred-Margaret: Fast Aho-Corasick string searching -- Copyright 2019 Channable -- -- Licensed under the 3-clause BSD license, see the LICENSE file in the -- repository root. {-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DerivingStrategies #-} -- | Implements sequential string replacements based on the Aho-Corasick algorithm. module Data.Text.AhoCorasick.Replacer ( -- * State machine Replacer (..) , build , compose , run , runWithLimit , Needle , Replacement , Payload (..) ) where import Control.DeepSeq (NFData) import Data.Hashable (Hashable) import Data.List (sort) import Data.Maybe (fromJust) import Data.Text (Text) import GHC.Generics (Generic) #if defined(HAS_AESON) import qualified Data.Aeson as AE #endif import qualified Data.Text as Text import Data.Text.AhoCorasick.Automaton (CaseSensitivity (..), CodeUnitIndex) import Data.Text.AhoCorasick.Searcher (Searcher) import qualified Data.Text.AhoCorasick.Automaton as Aho import qualified Data.Text.AhoCorasick.Searcher as Searcher import qualified Data.Text.Utf16 as Utf16 -- | Descriptive type alias for strings to search for. type Needle = Text -- | Descriptive type alias for replacements. type Replacement = Text -- | Priority of a needle. Higher integers indicate higher priorities. -- Replacement order is such that all matches of priority p are replaced before -- replacing any matches of priority q where p > q. type Priority = Int data Payload = Payload { needlePriority :: {-# UNPACK #-} !Priority , needleLength :: {-# UNPACK #-} !CodeUnitIndex , needleReplacement :: !Replacement } #if defined(HAS_AESON) deriving (Eq, Generic, Hashable, NFData, Show, AE.FromJSON, AE.ToJSON) #else deriving (Eq, Generic, Hashable, NFData, Show) #endif -- | A state machine used for efficient replacements with many different needles. data Replacer = Replacer { replacerCaseSensitivity :: CaseSensitivity , replacerSearcher :: Searcher Payload } deriving stock (Show, Eq, Generic) #if defined(HAS_AESON) deriving (Hashable, NFData, AE.FromJSON, AE.ToJSON) #else deriving (Hashable, NFData) #endif -- | Build an Aho-Corasick automaton that can be used for performing fast -- sequential replaces. -- -- Case-insensitive matching performs per-letter language-agnostic case folding. -- Therefore, it will work in most cases, but not in languages where case folding -- depends on the context of the character in question. -- -- We need to revisit this algorithm when we want to implement full Unicode -- support. build :: CaseSensitivity -> [(Needle, Replacement)] -> Replacer build caseSensitivity replaces = Replacer caseSensitivity searcher where searcher = Searcher.buildWithValues caseSensitivity $ zipWith mapNeedle [0..] replaces mapNeedle i (needle, replacement) = let needle' = case caseSensitivity of CaseSensitive -> needle IgnoreCase -> Utf16.lowerUtf16 needle in -- Note that we negate i: earlier needles have a higher priority. We -- could avoid it and define larger integers to be lower priority, but -- that made the terminology in this module very confusing. (needle', Payload (-i) (Utf16.lengthUtf16 needle') replacement) -- | Return the composition `replacer2` after `replacer1`, if they have the same -- case sensitivity. If the case sensitivity differs, Nothing is returned. compose :: Replacer -> Replacer -> Maybe Replacer compose (Replacer case1 searcher1) (Replacer case2 searcher2) | case1 /= case2 = Nothing | otherwise = let -- Replace the priorities of the second machine, so they all come after -- the first. renumber i (needle, Payload _ len replacement) = (needle, Payload (-i) len replacement) needles1 = Searcher.needles searcher1 needles2 = Searcher.needles searcher2 searcher = Searcher.buildWithValues case1 $ zipWith renumber [0..] (needles1 ++ needles2) in Just $ Replacer case1 searcher -- A match collected while running replacements. It is isomorphic to the Match -- reported by the automaton, but the data is arranged in a more useful way: -- as the start index and length of the match, and the replacement. data Match = Match !CodeUnitIndex !CodeUnitIndex !Text deriving (Eq, Ord, Show) -- | Apply replacements of all matches. Assumes that the matches are ordered by -- match position, and that no matches overlap. replace :: [Match] -> Text -> Text replace matches haystack = Text.concat $ go 0 matches haystack where -- At every match, cut the string into three pieces, removing the match. -- Because a Text is a buffer pointer and (offset, length), cutting does not -- involve string copies. Only at the very end we piece together the strings -- again, so Text can allocate a buffer of the right length and memcpy the -- parts into the new target string. -- If `k` is a code unit index into the original text, then `k - offset` -- is an index into `remainder`. In other words, `offset` is the index into -- the original text where `remainder` starts. go :: CodeUnitIndex -> [Match] -> Text -> [Text] go !_offset [] remainder = [remainder] go !offset ((Match pos len replacement) : ms) remainder = let (prefix, suffix) = Utf16.unsafeCutUtf16 (pos - offset) len remainder in prefix : replacement : go (pos + len) ms suffix -- | Compute the length of the string resulting from applying the replacements. replacementLength :: [Match] -> Text -> CodeUnitIndex replacementLength matches initial = go matches (Utf16.lengthUtf16 initial) where go [] !acc = acc go (Match _ matchLen repl : rest) !acc = go rest (acc - matchLen + Utf16.lengthUtf16 repl) -- | Given a list of matches sorted on start position, remove matches that start -- within an earlier match. removeOverlap :: [Match] -> [Match] removeOverlap matches = case matches of [] -> [] m:[] -> m:[] (m0@(Match pos0 len0 _) : m1@(Match pos1 _ _) : ms) -> if pos1 >= pos0 + len0 then m0 : removeOverlap (m1:ms) else removeOverlap (m0:ms) -- | When we iterate through all matches, keep track only of the matches with -- the highest priority: those are the ones that we will replace first. If we -- find multiple matches with that priority, remember all of them. If we find a -- match with lower priority, ignore it, because we already have a more -- important match. Also, if the priority is `threshold` or higher, ignore the -- match, so we can exclude matches if we already did a round of replacements -- for that priority. This way we don't have to build a new automaton after -- every round of replacements. {-# INLINE prependMatch #-} prependMatch :: Priority -> (Priority, [Match]) -> Aho.Match Payload -> Aho.Next (Priority, [Match]) prependMatch !threshold (!pBest, !matches) (Aho.Match pos (Payload pMatch len replacement)) | pMatch < threshold && pMatch > pBest = Aho.Step (pMatch, [Match (pos - len) len replacement]) | pMatch < threshold && pMatch == pBest = Aho.Step (pMatch, (Match (pos - len) len replacement) : matches) | otherwise = Aho.Step (pBest, matches) run :: Replacer -> Text -> Text run replacer = fromJust . runWithLimit replacer maxBound {-# NOINLINE runWithLimit #-} runWithLimit :: Replacer -> CodeUnitIndex -> Text -> Maybe Text runWithLimit (Replacer case_ searcher) maxLength = go initialThreshold where !automaton = Searcher.automaton searcher -- Priorities are 0 or lower, so an initial threshold of 1 keeps all -- matches. !initialThreshold = 1 -- Needle priorities go from 0 for the highest priority to (-numNeedles + 1) -- for the lowest priority. That means that if we find a match with -- minPriority, we don't need to do another pass afterwards, because there -- are no remaining needles. !minPriority = 1 - Searcher.numNeedles searcher go :: Priority -> Text -> Maybe Text go !threshold haystack = let seed = (minBound :: Priority, []) matchesWithPriority = case case_ of CaseSensitive -> Aho.runText seed (prependMatch threshold) automaton haystack IgnoreCase -> Aho.runLower seed (prependMatch threshold) automaton haystack in case matchesWithPriority of -- No match at the given threshold, there is nothing left to do. -- Return the input string unmodified. (_, []) -> Just haystack -- We found matches at priority p. Remove overlapping matches, then -- apply all replacements. Next, we need to go again, this time -- considering only needles with a lower priority than p. As an -- optimization (which matters mainly for the single needle case), -- if we find a match at the lowest priority, we don't need another -- pass. Note that if in `rawMatches` we find only matches of priority -- p > minPriority, then we do still need another pass, because the -- replacements could create new matches. (p, matches) | replacementLength matches haystack > maxLength -> Nothing | p == minPriority -> Just $ replace (removeOverlap $ sort matches) haystack | otherwise -> go p $ replace (removeOverlap $ sort matches) haystack