-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2019 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}

-- | Splitting strings using Aho–Corasick.
module Data.Text.AhoCorasick.Splitter
    ( Splitter
    , automaton
    , build
    , separator
    , split
    , splitIgnoreCase
    , splitReverse
    , splitReverseIgnoreCase
    ) where

import Control.DeepSeq (NFData (..))
import Data.Function (on)
import Data.Hashable (Hashable (..))
import Data.List.NonEmpty (NonEmpty ((:|)))
import Data.Text.Utf8 (Text)

#if defined(HAS_AESON)
import qualified Data.Aeson as AE
#endif

import qualified Data.List.NonEmpty as NonEmpty

import Data.Text.AhoCorasick.Automaton (AcMachine)

import qualified Data.Text.Utf8 as Utf8
import qualified Data.Text.AhoCorasick.Automaton as Aho

--------------------------------------------------------------------------------
-- Splitter

-- | Build a splitter once, then use it many times!
data Splitter =
  Splitter
    { Splitter -> AcMachine ()
splitterAutomaton :: AcMachine () -- INVARIANT: Exactly one needle.
    , Splitter -> Text
splitterSeparator :: Text         -- INVARIANT: Equivalent to needle.
    }

#if defined(HAS_AESON)
instance AE.ToJSON Splitter where
  toJSON :: Splitter -> Value
toJSON = Text -> Value
forall a. ToJSON a => a -> Value
AE.toJSON (Text -> Value) -> (Splitter -> Text) -> Splitter -> Value
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Splitter -> Text
separator

instance AE.FromJSON Splitter where
  parseJSON :: Value -> Parser Splitter
parseJSON Value
v = Text -> Splitter
build (Text -> Splitter) -> Parser Text -> Parser Splitter
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Value -> Parser Text
forall a. FromJSON a => Value -> Parser a
AE.parseJSON Value
v
#endif

-- | Construct a splitter with a single separator.
{-# INLINE build #-}
build :: Text -> Splitter
build :: Text -> Splitter
build Text
sep =
  let !auto :: AcMachine ()
auto = [(Text, ())] -> AcMachine ()
forall v. [(Text, v)] -> AcMachine v
Aho.build [(Text
sep, ())] in
  AcMachine () -> Text -> Splitter
Splitter AcMachine ()
auto Text
sep

-- | Get the automaton that would be used for finding separators.
{-# INLINE automaton #-}
automaton :: Splitter -> AcMachine ()
automaton :: Splitter -> AcMachine ()
automaton = Splitter -> AcMachine ()
splitterAutomaton

-- | What is the separator we are splitting on?
{-# INLINE separator #-}
separator :: Splitter -> Text
separator :: Splitter -> Text
separator = Splitter -> Text
splitterSeparator

-- | Split the given string into strings separated by the separator.
--
-- If the order of the results is not important, use the faster function
-- 'splitReverse' instead.
{-# INLINE split #-}
split :: Splitter -> Text -> NonEmpty Text
split :: Splitter -> Text -> NonEmpty Text
split = (NonEmpty Text -> NonEmpty Text
forall a. NonEmpty a -> NonEmpty a
NonEmpty.reverse (NonEmpty Text -> NonEmpty Text)
-> (Text -> NonEmpty Text) -> Text -> NonEmpty Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
.) ((Text -> NonEmpty Text) -> Text -> NonEmpty Text)
-> (Splitter -> Text -> NonEmpty Text)
-> Splitter
-> Text
-> NonEmpty Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Splitter -> Text -> NonEmpty Text
splitReverse

-- | Split the given string into strings separated by the separator.
--
-- If the order of the results is not important, use the faster function
-- 'splitReverseIgnoreCase' instead.
--
-- The separator is matched case-insensitively, but the splitter must have been
-- constructed with a lowercase needle.
{-# INLINE splitIgnoreCase #-}
splitIgnoreCase :: Splitter -> Text -> NonEmpty Text
splitIgnoreCase :: Splitter -> Text -> NonEmpty Text
splitIgnoreCase = (NonEmpty Text -> NonEmpty Text
forall a. NonEmpty a -> NonEmpty a
NonEmpty.reverse (NonEmpty Text -> NonEmpty Text)
-> (Text -> NonEmpty Text) -> Text -> NonEmpty Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
.) ((Text -> NonEmpty Text) -> Text -> NonEmpty Text)
-> (Splitter -> Text -> NonEmpty Text)
-> Splitter
-> Text
-> NonEmpty Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Splitter -> Text -> NonEmpty Text
splitReverseIgnoreCase

-- | Like 'split', but return the substrings in reverse order.
{-# INLINE splitReverse #-}
splitReverse :: Splitter -> Text -> NonEmpty Text
splitReverse :: Splitter -> Text -> NonEmpty Text
splitReverse Splitter
s Text
t =
  Accum -> NonEmpty Text
finalizeAccum (Accum -> NonEmpty Text) -> Accum -> NonEmpty Text
forall a b. (a -> b) -> a -> b
$
    Accum
-> (Accum -> Match () -> Next Accum)
-> AcMachine ()
-> Text
-> Accum
forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Text -> a
Aho.runText
      (Text -> Text -> Accum
zeroAccum (Splitter -> Text
separator Splitter
s) Text
t)
      Accum -> Match () -> Next Accum
forall v. Accum -> Match v -> Next Accum
stepAccum
      (Splitter -> AcMachine ()
automaton Splitter
s)
      Text
t

-- | Like 'splitIgnoreCase', but return the substrings in reverse order.
{-# INLINE splitReverseIgnoreCase #-}
splitReverseIgnoreCase :: Splitter -> Text -> NonEmpty Text
splitReverseIgnoreCase :: Splitter -> Text -> NonEmpty Text
splitReverseIgnoreCase Splitter
s Text
t =
  Accum -> NonEmpty Text
finalizeAccum (Accum -> NonEmpty Text) -> Accum -> NonEmpty Text
forall a b. (a -> b) -> a -> b
$
    Accum
-> (Accum -> Match () -> Next Accum)
-> AcMachine ()
-> Text
-> Accum
forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Text -> a
Aho.runLower
      (Text -> Text -> Accum
zeroAccum (Splitter -> Text
separator Splitter
s) Text
t)
      Accum -> Match () -> Next Accum
forall v. Accum -> Match v -> Next Accum
stepAccum
      (Splitter -> AcMachine ()
automaton Splitter
s)
      Text
t

--------------------------------------------------------------------------------
-- Fold

-- | The accumulator is used as state when processing the matches from left to
-- right. While the matches are fed to us ordered by end offset, all matches
-- have the same length because there is only one needle.
data Accum =
  Accum
    { Accum -> CodeUnitIndex
_accumSepLen   :: !Aho.CodeUnitIndex -- ^ Length of separator.
    , Accum -> Text
_accumHaystack :: !Text              -- ^ Haystack to slice off of.
    , Accum -> [Text]
accumResult    :: ![Text]            -- ^ Match-separated strings.
    , Accum -> CodeUnitIndex
accumPrevEnd   :: !Aho.CodeUnitIndex -- ^ Offset at end of last match.
    }

-- | Finalizing the accumulator does more than just 'accumResult', hence this
-- is a separate function.
{-# INLINE finalizeAccum #-}
finalizeAccum :: Accum -> NonEmpty Text
finalizeAccum :: Accum -> NonEmpty Text
finalizeAccum (Accum CodeUnitIndex
_ Text
hay [Text]
res CodeUnitIndex
prevEnd) =
  -- Once we have processed all the matches, there is still the substring after
  -- the final match. This substring is always included in the result, even
  -- when there were no matches. Hence we can return a non-empty list.
  let !str :: Text
str = CodeUnitIndex -> CodeUnitIndex -> Text -> Text
Utf8.unsafeSliceUtf8 CodeUnitIndex
prevEnd (Text -> CodeUnitIndex
Utf8.lengthUtf8 Text
hay CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
prevEnd) Text
hay in
  Text
str Text -> [Text] -> NonEmpty Text
forall a. a -> [a] -> NonEmpty a
:| [Text]
res

-- | The initial accumulator begins at the begin of the haystack.
{-# INLINE zeroAccum #-}
zeroAccum :: Text -> Text -> Accum
zeroAccum :: Text -> Text -> Accum
zeroAccum Text
sep Text
hay = CodeUnitIndex -> Text -> [Text] -> CodeUnitIndex -> Accum
Accum (Text -> CodeUnitIndex
Utf8.lengthUtf8 Text
sep) Text
hay [] CodeUnitIndex
0

-- | Step the accumulator using the next match. Overlapping matches will be
-- ignored. Overlapping matches may occur when the separator has a non-empty
-- prefix that is also a suffix.
{-# INLINE stepAccum #-}
stepAccum :: Accum -> Aho.Match v -> Aho.Next Accum
stepAccum :: Accum -> Match v -> Next Accum
stepAccum acc :: Accum
acc@(Accum CodeUnitIndex
sepLen Text
hay [Text]
res CodeUnitIndex
prevEnd) (Aho.Match CodeUnitIndex
sepEnd v
_)

  -- When the match begins before the current offset, it overlaps a match that
  -- we processed before, and so we ignore it.
  | CodeUnitIndex
sepEnd CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
sepLen CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
prevEnd =
      Accum -> Next Accum
forall a. a -> Next a
Aho.Step Accum
acc

  -- The match is behind the current offset, so we slice the haystack until the
  -- begin of the match and include that as a result.
  | Bool
otherwise =
      let !str :: Text
str = CodeUnitIndex -> CodeUnitIndex -> Text -> Text
Utf8.unsafeSliceUtf8 CodeUnitIndex
prevEnd (CodeUnitIndex
sepEnd CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
sepLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
prevEnd) Text
hay in
      Accum -> Next Accum
forall a. a -> Next a
Aho.Step Accum
acc { accumResult :: [Text]
accumResult = Text
str Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
res, accumPrevEnd :: CodeUnitIndex
accumPrevEnd = CodeUnitIndex
sepEnd }

--------------------------------------------------------------------------------
-- Instances

instance Eq Splitter where
  {-# INLINE (==) #-}
  == :: Splitter -> Splitter -> Bool
(==) = Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
(==) (Text -> Text -> Bool)
-> (Splitter -> Text) -> Splitter -> Splitter -> Bool
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` Splitter -> Text
separator

instance Ord Splitter where
  {-# INLINE compare #-}
  compare :: Splitter -> Splitter -> Ordering
compare = Text -> Text -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Text -> Text -> Ordering)
-> (Splitter -> Text) -> Splitter -> Splitter -> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` Splitter -> Text
separator

instance Hashable Splitter where
  {-# INLINE hashWithSalt #-}
  hashWithSalt :: Int -> Splitter -> Int
hashWithSalt Int
salt Splitter
searcher =
    Int
salt Int -> Text -> Int
forall a. Hashable a => Int -> a -> Int
`hashWithSalt` Splitter -> Text
separator Splitter
searcher

instance NFData Splitter where
  {-# INLINE rnf #-}
  rnf :: Splitter -> ()
rnf (Splitter AcMachine ()
searcher Text
sepLength) =
    AcMachine () -> ()
forall a. NFData a => a -> ()
rnf AcMachine ()
searcher () -> () -> ()
`seq`
    Text -> ()
forall a. NFData a => a -> ()
rnf Text
sepLength

instance Show Splitter where
  showsPrec :: Int -> Splitter -> ShowS
showsPrec Int
p Splitter
splitter =
    Bool -> ShowS -> ShowS
showParen (Int
p Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
10) (ShowS -> ShowS) -> ShowS -> ShowS
forall a b. (a -> b) -> a -> b
$
      String -> ShowS
showString String
"build " ShowS -> ShowS -> ShowS
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
        Int -> Text -> ShowS
forall a. Show a => Int -> a -> ShowS
showsPrec Int
11 (Splitter -> Text
separator Splitter
splitter)