-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2022 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}

module Data.Text.AhoCorasick.Searcher
    ( Searcher
    , automaton
    , build
    , buildNeedleIdSearcher
    , buildWithValues
    , caseSensitivity
    , containsAll
    , containsAny
    , mapSearcher
    , needles
    , numNeedles
    , setCaseSensitivity
    ) where

import Control.DeepSeq (NFData)
import Data.Bifunctor (second)
import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed)
import GHC.Generics (Generic)

#if defined(HAS_AESON)
import Data.Aeson ((.:), (.=))
import qualified Data.Aeson as AE
#endif

import qualified Data.IntSet as IS

import Data.Text.CaseSensitivity (CaseSensitivity (..))
import Data.Text.Utf8 (Text)

import qualified Data.Text.AhoCorasick.Automaton as Aho

-- | A set of needles with associated values, and an Aho-Corasick automaton to
-- efficiently find those needles.
--
-- INVARIANT: searcherAutomaton = Aho.build . searcherNeedles
-- To enforce this invariant, the fields are not exposed from this module.
-- There is a separate constructor function.
--
-- The purpose of this wrapper is to have a type that is Hashable and Eq, so we
-- can derive those for types that embed the searcher, whithout requiring the
-- automaton itself to be Hashable or Eq, which would be both wasteful and
-- tedious. Because the automaton is fully determined by the needles and
-- associated values, it is sufficient to implement Eq and Hashable in terms of
-- the needles only.
--
-- We also use Hashed to cache the hash of the needles.
data Searcher v = Searcher
  { forall v. Searcher v -> CaseSensitivity
searcherCaseSensitive :: CaseSensitivity
  , forall v. Searcher v -> Hashed [(Text, v)]
searcherNeedles :: Hashed [(Text, v)]
  , forall v. Searcher v -> Int
searcherNumNeedles :: Int
  , forall v. Searcher v -> AcMachine v
searcherAutomaton :: Aho.AcMachine v
  } deriving (forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall v x. Rep (Searcher v) x -> Searcher v
forall v x. Searcher v -> Rep (Searcher v) x
$cto :: forall v x. Rep (Searcher v) x -> Searcher v
$cfrom :: forall v x. Searcher v -> Rep (Searcher v) x
Generic)

#if defined(HAS_AESON)
instance AE.ToJSON v => AE.ToJSON (Searcher v) where
  toJSON :: Searcher v -> Value
toJSON Searcher v
s = [Pair] -> Value
AE.object
    [ Key
"needles" forall kv v. (KeyValue kv, ToJSON v) => Key -> v -> kv
.= forall v. Searcher v -> [(Text, v)]
needles Searcher v
s
    , Key
"caseSensitivity" forall kv v. (KeyValue kv, ToJSON v) => Key -> v -> kv
.= forall v. Searcher v -> CaseSensitivity
caseSensitivity Searcher v
s
    ]

instance (Hashable v, AE.FromJSON v) => AE.FromJSON (Searcher v) where
  parseJSON :: Value -> Parser (Searcher v)
parseJSON = forall a. String -> (Object -> Parser a) -> Value -> Parser a
AE.withObject String
"Searcher" forall a b. (a -> b) -> a -> b
$ \Object
o -> forall v.
Hashable v =>
CaseSensitivity -> [(Text, v)] -> Searcher v
buildWithValues forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Object
o forall a. FromJSON a => Object -> Key -> Parser a
.: Key
"caseSensitivity" forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Object
o forall a. FromJSON a => Object -> Key -> Parser a
.: Key
"needles"
#endif

instance Show (Searcher v) where
  show :: Searcher v -> String
show Searcher v
_ = String
"Searcher _ _ _"

instance Hashable v => Hashable (Searcher v) where
  hashWithSalt :: Int -> Searcher v -> Int
hashWithSalt Int
salt Searcher v
searcher = forall a. Hashable a => Int -> a -> Int
hashWithSalt Int
salt forall a b. (a -> b) -> a -> b
$ forall v. Searcher v -> Hashed [(Text, v)]
searcherNeedles Searcher v
searcher
  {-# INLINE hashWithSalt #-}

instance Eq v => Eq (Searcher v) where
  -- Since we store the length of the needle list anyway,
  -- we can use it to early out if there is a length mismatch.
  Searcher CaseSensitivity
cx Hashed [(Text, v)]
xs Int
nx AcMachine v
_ == :: Searcher v -> Searcher v -> Bool
== Searcher CaseSensitivity
cy Hashed [(Text, v)]
ys Int
ny AcMachine v
_ = (Int
nx, Hashed [(Text, v)]
xs, CaseSensitivity
cx) forall a. Eq a => a -> a -> Bool
== (Int
ny, Hashed [(Text, v)]
ys, CaseSensitivity
cy)
  {-# INLINE (==) #-}

instance NFData v => NFData (Searcher v)

-- NOTE: Although we could implement Semigroup for every v by just concatenating
-- needle lists, we don't, because this might lead to unexpected results. For
-- example, if v is (Int, a) where the Int is a priority, combining two
-- searchers might want to discard priorities, concatenate the needle lists, and
-- reassign priorities, rather than concatenating the needle lists as-is and
-- possibly having duplicate priorities in the resulting searcher.
instance Semigroup (Searcher ()) where
  Searcher ()
x <> :: Searcher () -> Searcher () -> Searcher ()
<> Searcher ()
y
    | forall v. Searcher v -> CaseSensitivity
caseSensitivity Searcher ()
x forall a. Eq a => a -> a -> Bool
== forall v. Searcher v -> CaseSensitivity
caseSensitivity Searcher ()
y
      = forall v.
Hashable v =>
CaseSensitivity -> [(Text, v)] -> Searcher v
buildWithValues (forall v. Searcher v -> CaseSensitivity
searcherCaseSensitive Searcher ()
x) (forall v. Searcher v -> [(Text, v)]
needles Searcher ()
x forall a. Semigroup a => a -> a -> a
<> forall v. Searcher v -> [(Text, v)]
needles Searcher ()
y)
    | Bool
otherwise = forall a. HasCallStack => String -> a
error String
"Combining searchers of different case sensitivity"
  {-# INLINE (<>) #-}

-- | Builds the Searcher for a list of needles
-- The caller is responsible that the needles are lower case in case the IgnoreCase
-- is used for case sensitivity
build :: CaseSensitivity -> [Text] -> Searcher ()
build :: CaseSensitivity -> [Text] -> Searcher ()
build CaseSensitivity
case_ = forall v.
Hashable v =>
CaseSensitivity -> [(Text, v)] -> Searcher v
buildWithValues CaseSensitivity
case_ forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (, ())

-- | The caller is responsible that the needles are lower case in case the IgnoreCase
-- is used for case sensitivity
buildWithValues :: Hashable v => CaseSensitivity -> [(Text, v)] -> Searcher v
{-# INLINABLE buildWithValues #-}
buildWithValues :: forall v.
Hashable v =>
CaseSensitivity -> [(Text, v)] -> Searcher v
buildWithValues CaseSensitivity
case_ [(Text, v)]
ns =
  forall v.
CaseSensitivity
-> Hashed [(Text, v)] -> Int -> AcMachine v -> Searcher v
Searcher CaseSensitivity
case_ (forall a. Hashable a => a -> Hashed a
hashed [(Text, v)]
ns) (forall (t :: * -> *) a. Foldable t => t a -> Int
length [(Text, v)]
ns) forall a b. (a -> b) -> a -> b
$ forall v. [(Text, v)] -> AcMachine v
Aho.build [(Text, v)]
ns

-- | Modify the values associated with the needles.
mapSearcher :: Hashable b => (a -> b) -> Searcher a -> Searcher b
mapSearcher :: forall b a. Hashable b => (a -> b) -> Searcher a -> Searcher b
mapSearcher a -> b
f Searcher a
searcher = Searcher a
searcher
  { searcherNeedles :: Hashed [(Text, b)]
searcherNeedles = forall a. Hashable a => a -> Hashed a
hashed forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second a -> b
f) forall a b. (a -> b) -> a -> b
$ forall v. Searcher v -> [(Text, v)]
needles Searcher a
searcher
  , searcherAutomaton :: AcMachine b
searcherAutomaton = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> b
f (forall v. Searcher v -> AcMachine v
searcherAutomaton Searcher a
searcher)
  }

needles :: Searcher v -> [(Text, v)]
needles :: forall v. Searcher v -> [(Text, v)]
needles = forall a. Hashed a -> a
unhashed forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall v. Searcher v -> Hashed [(Text, v)]
searcherNeedles

numNeedles :: Searcher v -> Int
numNeedles :: forall v. Searcher v -> Int
numNeedles = forall v. Searcher v -> Int
searcherNumNeedles

automaton :: Searcher v -> Aho.AcMachine v
automaton :: forall v. Searcher v -> AcMachine v
automaton = forall v. Searcher v -> AcMachine v
searcherAutomaton

caseSensitivity :: Searcher v -> CaseSensitivity
caseSensitivity :: forall v. Searcher v -> CaseSensitivity
caseSensitivity = forall v. Searcher v -> CaseSensitivity
searcherCaseSensitive

-- | Updates the case sensitivity of the searcher. Does not change the
-- capitilization of the needles. The caller should be certain that if IgnoreCase
-- is passed, the needles are already lower case.
setCaseSensitivity :: CaseSensitivity -> Searcher v -> Searcher v
setCaseSensitivity :: forall v. CaseSensitivity -> Searcher v -> Searcher v
setCaseSensitivity CaseSensitivity
case_ Searcher v
searcher = Searcher v
searcher{
    searcherCaseSensitive :: CaseSensitivity
searcherCaseSensitive = CaseSensitivity
case_
  }

-- | Return whether the haystack contains any of the needles.
-- Case sensitivity depends on the properties of the searcher
-- This function is marked noinline as an inlining boundary. Aho.runText is
-- marked inline, so this function will be optimized to report only whether
-- there is a match, and not construct a list of matches. We don't want this
-- function be inline, to make sure that the conditions of the caller don't
-- affect how this function is optimized. There is little to gain from
-- additional inlining. The pragma is not an optimization in itself, rather it
-- is a defence against fragile optimizer decisions.
{-# NOINLINE containsAny #-}
containsAny :: Searcher () -> Text -> Bool
containsAny :: Searcher () -> Text -> Bool
containsAny !Searcher ()
searcher !Text
text =
  let
    -- On the first match, return True immediately.
    f :: p -> p -> Next Bool
f p
_acc p
_match = forall a. a -> Next a
Aho.Done Bool
True
  in case forall v. Searcher v -> CaseSensitivity
caseSensitivity Searcher ()
searcher of
    CaseSensitivity
CaseSensitive  -> forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Text -> a
Aho.runText Bool
False forall {p} {p}. p -> p -> Next Bool
f (forall v. Searcher v -> AcMachine v
automaton Searcher ()
searcher) Text
text
    CaseSensitivity
IgnoreCase      -> forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Text -> a
Aho.runLower Bool
False forall {p} {p}. p -> p -> Next Bool
f (forall v. Searcher v -> AcMachine v
automaton Searcher ()
searcher) Text
text

-- | Build a 'Searcher' that returns the needle's index in the needle list when it matches.
buildNeedleIdSearcher :: CaseSensitivity -> [Text] -> Searcher Int
buildNeedleIdSearcher :: CaseSensitivity -> [Text] -> Searcher Int
buildNeedleIdSearcher !CaseSensitivity
case_ ![Text]
ns =
  forall v.
Hashable v =>
CaseSensitivity -> [(Text, v)] -> Searcher v
buildWithValues CaseSensitivity
case_ forall a b. (a -> b) -> a -> b
$ forall a b. [a] -> [b] -> [(a, b)]
zip [Text]
ns [Int
0..]

-- | Returns whether the haystack contains all of the needles.
-- This function expects the passed 'Searcher' to be constructed using 'buildNeedleIdAutomaton'.
containsAll :: Searcher Int -> Text -> Bool
containsAll :: Searcher Int -> Text -> Bool
containsAll !Searcher Int
searcher !Text
haystack =
  let
    initial :: IntSet
initial = [Int] -> IntSet
IS.fromDistinctAscList [Int
0..forall v. Searcher v -> Int
numNeedles Searcher Int
searcher forall a. Num a => a -> a -> a
- Int
1]
    ac :: AcMachine Int
ac = forall v. Searcher v -> AcMachine v
automaton Searcher Int
searcher

    f :: IntSet -> Match Int -> Next IntSet
f !IntSet
acc (Aho.Match CodeUnitIndex
_index !Int
needleId)
      | IntSet -> Bool
IS.null IntSet
acc' = forall a. a -> Next a
Aho.Done IntSet
acc'
      | Bool
otherwise = forall a. a -> Next a
Aho.Step IntSet
acc'
      where
        !acc' :: IntSet
acc' = Int -> IntSet -> IntSet
IS.delete Int
needleId IntSet
acc

  in IntSet -> Bool
IS.null forall a b. (a -> b) -> a -> b
$ case forall v. Searcher v -> CaseSensitivity
caseSensitivity Searcher Int
searcher of
    CaseSensitivity
CaseSensitive -> forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Text -> a
Aho.runText IntSet
initial IntSet -> Match Int -> Next IntSet
f AcMachine Int
ac Text
haystack
    CaseSensitivity
IgnoreCase -> forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Text -> a
Aho.runLower IntSet
initial IntSet -> Match Int -> Next IntSet
f AcMachine Int
ac Text
haystack