-- Alfred-Margaret: Fast Aho-Corasick string searching -- Copyright 2019 Channable -- -- Licensed under the 3-clause BSD license, see the LICENSE file in the -- repository root. {-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE FlexibleInstances #-} module Data.Text.BoyerMoore.Searcher ( Searcher , build , buildWithValues , needles , numNeedles , automata , caseSensitivity , containsAny , setSearcherCaseSensitivity ) where import Control.DeepSeq (NFData) import Data.Bifunctor (first) import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed) import Data.Text (Text) import GHC.Generics (Generic) import Data.Text.BoyerMoore.Automaton (Automaton, CaseSensitivity (..)) import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore -- | A set of needles with associated values, and Boyer-Moore automata to -- efficiently find those needles. -- -- INVARIANT: searcherAutomaton = BoyerMoore.buildAutomaton . searcherNeedles -- To enforce this invariant, the fields are not exposed from this module. -- There is a separate constructor function. -- -- The purpose of this wrapper is to have a type that is Hashable and Eq, so we -- can derive those for the types that embed the searcher, whithout -- requiring the automaton itself to be Hashable or Eq, which would be both -- wasteful and tedious. Because the automaton is fully determined by the -- needles and associated values, it is sufficient to implement Eq and Hashable -- in terms of the needles only. -- -- We also use Hashed to cache the hash of the needles. data Searcher v = Searcher { searcherCaseSensitive :: CaseSensitivity , searcherNeedles :: Hashed [(Text, v)] , searcherNumNeedles :: Int , searcherAutomata :: [(Automaton, v)] } deriving (Generic) instance Show (Searcher v) where show _ = "Searcher _ _ _" instance Hashable v => Hashable (Searcher v) where hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher {-# INLINE hashWithSalt #-} instance Eq v => Eq (Searcher v) where Searcher cx xs nx _ == Searcher cy ys ny _ = (cx, nx, xs) == (cy, ny, ys) {-# INLINE (==) #-} instance NFData v => NFData (Searcher v) -- | Builds the Searcher for a list of needles -- The caller is responsible that the needles are lower case in case the IgnoreCase -- is used for case sensitivity build :: CaseSensitivity -> [Text] -> Searcher () {-# INLINABLE build #-} build case_ = buildWithValues case_ . flip zip (repeat ()) -- | The caller is responsible that the needles are lower case in case the IgnoreCase -- is used for case sensitivity buildWithValues :: Hashable v => CaseSensitivity -> [(Text, v)] -> Searcher v {-# INLINABLE buildWithValues #-} buildWithValues case_ ns = Searcher case_ (hashed ns) (length ns) $ map (first BoyerMoore.buildAutomaton) ns needles :: Searcher v -> [(Text, v)] needles = unhashed . searcherNeedles automata :: Searcher v -> [(Automaton, v)] automata = searcherAutomata numNeedles :: Searcher v -> Int numNeedles = searcherNumNeedles caseSensitivity :: Searcher v -> CaseSensitivity caseSensitivity = searcherCaseSensitive -- | Updates the case sensitivity of the searcher. Does not change the -- capitilization of the needles. The caller should be certain that if IgnoreCase -- is passed, the needles are already lower case. setSearcherCaseSensitivity :: CaseSensitivity -> Searcher v -> Searcher v setSearcherCaseSensitivity case_ searcher = searcher{ searcherCaseSensitive = case_ } -- | Return whether the haystack contains any of the needles. -- Case sensitivity depends on the properties of the searcher -- This function is marked noinline as an inlining boundary. BoyerMoore.runText is -- marked inline, so this function will be optimized to report only whether -- there is a match, and not construct a list of matches. We don't want this -- function be inline, to make sure that the conditions of the caller don't -- affect how this function is optimized. There is little to gain from -- additional inlining. The pragma is not an optimization in itself, rather it -- is a defence against fragile optimizer decisions. {-# NOINLINE containsAny #-} containsAny :: Searcher () -> Text -> Bool containsAny !searcher !text = let -- On the first match, return True immediately. f _acc _match = BoyerMoore.Done True in case caseSensitivity searcher of CaseSensitive -> any (\(automaton, ()) -> BoyerMoore.runText False f automaton text) (automata searcher) IgnoreCase -> any (\(automaton, ()) -> BoyerMoore.runLower False f automaton text) (automata searcher)