-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2019 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}

-- | Splitting strings using Aho–Corasick.
module Data.Text.AhoCorasick.Splitter
    ( Splitter
    , automaton
    , build
    , separator
    , split
    , splitIgnoreCase
    , splitReverse
    , splitReverseIgnoreCase
    ) where

import Control.DeepSeq (NFData (..))
import Data.Function (on)
import Data.Hashable (Hashable (..))
import Data.List.NonEmpty (NonEmpty ((:|)))
import Data.Text.Utf8 (Text)

#if defined(HAS_AESON)
import qualified Data.Aeson as AE
#endif

import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Text as Text

import Data.Text.AhoCorasick.Automaton (AcMachine)

import qualified Data.Text.Utf8 as Utf8
import qualified Data.Text.AhoCorasick.Automaton as Aho

--------------------------------------------------------------------------------
-- Splitter

-- | Build a splitter once, then use it many times!
data Splitter =
  Splitter
    { Splitter -> AcMachine ()
splitterAutomaton :: AcMachine () -- INVARIANT: Exactly one needle.
    , Splitter -> Text
splitterSeparator :: Text         -- INVARIANT: Equivalent to needle.
    }

#if defined(HAS_AESON)
instance AE.ToJSON Splitter where
  toJSON :: Splitter -> Value
toJSON = forall a. ToJSON a => a -> Value
AE.toJSON forall b c a. (b -> c) -> (a -> b) -> a -> c
. Splitter -> Text
separator

instance AE.FromJSON Splitter where
  parseJSON :: Value -> Parser Splitter
parseJSON Value
v = Text -> Splitter
build forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall a. FromJSON a => Value -> Parser a
AE.parseJSON Value
v
#endif

-- | Construct a splitter with a single separator.
{-# INLINE build #-}
build :: Text -> Splitter
build :: Text -> Splitter
build Text
sep =
  let !auto :: AcMachine ()
auto = forall v. [(Text, v)] -> AcMachine v
Aho.build [(Text
sep, ())] in
  AcMachine () -> Text -> Splitter
Splitter AcMachine ()
auto Text
sep

-- | Get the automaton that would be used for finding separators.
{-# INLINE automaton #-}
automaton :: Splitter -> AcMachine ()
automaton :: Splitter -> AcMachine ()
automaton = Splitter -> AcMachine ()
splitterAutomaton

-- | What is the separator we are splitting on?
{-# INLINE separator #-}
separator :: Splitter -> Text
separator :: Splitter -> Text
separator = Splitter -> Text
splitterSeparator

-- | Split the given string into strings separated by the separator.
--
-- If the order of the results is not important, use the faster function
-- 'splitReverse' instead.
{-# INLINE split #-}
split :: Splitter -> Text -> NonEmpty Text
split :: Splitter -> Text -> NonEmpty Text
split = (forall a. NonEmpty a -> NonEmpty a
NonEmpty.reverse forall b c a. (b -> c) -> (a -> b) -> a -> c
.) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Splitter -> Text -> NonEmpty Text
splitReverse

-- | Split the given string into strings separated by the separator.
--
-- If the order of the results is not important, use the faster function
-- 'splitReverseIgnoreCase' instead.
--
-- The separator is matched case-insensitively, but the splitter must have been
-- constructed with a lowercase needle.
{-# INLINE splitIgnoreCase #-}
splitIgnoreCase :: Splitter -> Text -> NonEmpty Text
splitIgnoreCase :: Splitter -> Text -> NonEmpty Text
splitIgnoreCase = (forall a. NonEmpty a -> NonEmpty a
NonEmpty.reverse forall b c a. (b -> c) -> (a -> b) -> a -> c
.) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Splitter -> Text -> NonEmpty Text
splitReverseIgnoreCase

-- | Like 'split', but return the substrings in reverse order.
{-# INLINE splitReverse #-}
splitReverse :: Splitter -> Text -> NonEmpty Text
splitReverse :: Splitter -> Text -> NonEmpty Text
splitReverse Splitter
s Text
t =
  Text -> Accum -> NonEmpty Text
finalizeAccum Text
t forall a b. (a -> b) -> a -> b
$ forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Text -> a
Aho.runText Accum
zeroAccum forall {v}. Accum -> Match v -> Next Accum
stepAccum' (Splitter -> AcMachine ()
automaton Splitter
s) Text
t
  where
    -- Case sensitive matching: separator length is in bytes.
    sepLength :: CodeUnitIndex
sepLength = Text -> CodeUnitIndex
Utf8.lengthUtf8 (Splitter -> Text
separator Splitter
s)
    stepAccum' :: Accum -> Match v -> Next Accum
stepAccum' Accum
accum (Aho.Match CodeUnitIndex
newFragmentStart v
_) =
      Text -> Accum -> CodeUnitIndex -> CodeUnitIndex -> Next Accum
stepAccum Text
t Accum
accum (CodeUnitIndex
newFragmentStart forall a. Num a => a -> a -> a
- CodeUnitIndex
sepLength) CodeUnitIndex
newFragmentStart


-- | Like 'splitIgnoreCase', but return the substrings in reverse order.
{-# INLINE splitReverseIgnoreCase #-}
splitReverseIgnoreCase :: Splitter -> Text -> NonEmpty Text
splitReverseIgnoreCase :: Splitter -> Text -> NonEmpty Text
splitReverseIgnoreCase Splitter
s Text
t =
  Text -> Accum -> NonEmpty Text
finalizeAccum Text
t forall a b. (a -> b) -> a -> b
$ forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Text -> a
Aho.runLower Accum
zeroAccum forall {v}. Accum -> Match v -> Next Accum
stepAccum' (Splitter -> AcMachine ()
automaton Splitter
s) Text
t
  where
    -- Case insensitive matching: separator length is in codepoints.
    sepLength :: Int
sepLength = Text -> Int
Text.length (Splitter -> Text
separator Splitter
s)
    stepAccum' :: Accum -> Match v -> Next Accum
stepAccum' Accum
accum (Aho.Match CodeUnitIndex
newFragmentStart v
_) =
      -- We start at the last byte of the separator, and look backwards.
      let sepStart :: CodeUnitIndex
sepStart = Text -> CodeUnitIndex -> Int -> CodeUnitIndex
Utf8.skipCodePointsBackwards Text
t (CodeUnitIndex
newFragmentStartforall a. Num a => a -> a -> a
-CodeUnitIndex
1) (Int
sepLengthforall a. Num a => a -> a -> a
-Int
1) in
      Text -> Accum -> CodeUnitIndex -> CodeUnitIndex -> Next Accum
stepAccum Text
t Accum
accum CodeUnitIndex
sepStart CodeUnitIndex
newFragmentStart

--------------------------------------------------------------------------------
-- Fold

-- | The accumulator is used as state when processing the matches from left to
-- right. While the matches are fed to us ordered by end offset, all matches
-- have the same length because there is only one needle.
data Accum =
  Accum
    { Accum -> [Text]
accumResult :: ![Text]
      -- ^ Match-separated strings.
    , Accum -> CodeUnitIndex
accumFragmentStart :: !Aho.CodeUnitIndex
      -- ^ First byte of current fragment (that is the non-separator part)
    }

-- | Finalizing the accumulator does more than just 'accumResult', hence this
-- is a separate function.
{-# INLINE finalizeAccum #-}
finalizeAccum :: Text -> Accum -> NonEmpty Text
finalizeAccum :: Text -> Accum -> NonEmpty Text
finalizeAccum Text
hay (Accum [Text]
res CodeUnitIndex
prevEnd) =
  -- Once we have processed all the matches, there is still the substring after
  -- the final match. This substring is always included in the result, even
  -- when there were no matches. Hence we can return a non-empty list.
  let !str :: Text
str = CodeUnitIndex -> CodeUnitIndex -> Text -> Text
Utf8.unsafeSliceUtf8 CodeUnitIndex
prevEnd (Text -> CodeUnitIndex
Utf8.lengthUtf8 Text
hay forall a. Num a => a -> a -> a
- CodeUnitIndex
prevEnd) Text
hay in
  Text
str forall a. a -> [a] -> NonEmpty a
:| [Text]
res

-- | The initial accumulator begins at the begin of the haystack.
{-# INLINE zeroAccum #-}
zeroAccum :: Accum
zeroAccum :: Accum
zeroAccum = Accum { accumResult :: [Text]
accumResult = [], accumFragmentStart :: CodeUnitIndex
accumFragmentStart = CodeUnitIndex
0 }

-- | Step the accumulator using the next match. Overlapping matches will be
-- ignored. Overlapping matches may occur when the separator has a non-empty
-- prefix that is also a suffix.
{-# INLINE stepAccum #-}
stepAccum :: Text -> Accum -> Aho.CodeUnitIndex -> Aho.CodeUnitIndex -> Aho.Next Accum
stepAccum :: Text -> Accum -> CodeUnitIndex -> CodeUnitIndex -> Next Accum
stepAccum Text
hay acc :: Accum
acc@(Accum [Text]
res CodeUnitIndex
fragmentStart) CodeUnitIndex
sepStart CodeUnitIndex
newFragmentStart

  -- When the match begins before the current offset, it overlaps a match that
  -- we processed before, and so we ignore it.
  | CodeUnitIndex
sepStart forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
fragmentStart =
      forall a. a -> Next a
Aho.Step Accum
acc

  -- The match is behind the current offset, so we slice the haystack until the
  -- begin of the match and include that as a result.
  | Bool
otherwise =
      let !str :: Text
str = CodeUnitIndex -> CodeUnitIndex -> Text -> Text
Utf8.unsafeSliceUtf8 CodeUnitIndex
fragmentStart (CodeUnitIndex
sepStart forall a. Num a => a -> a -> a
- CodeUnitIndex
fragmentStart) Text
hay in
      forall a. a -> Next a
Aho.Step Accum
acc { accumResult :: [Text]
accumResult = Text
str forall a. a -> [a] -> [a]
: [Text]
res, accumFragmentStart :: CodeUnitIndex
accumFragmentStart = CodeUnitIndex
newFragmentStart }

--------------------------------------------------------------------------------
-- Instances

instance Eq Splitter where
  {-# INLINE (==) #-}
  == :: Splitter -> Splitter -> Bool
(==) = forall a. Eq a => a -> a -> Bool
(==) forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` Splitter -> Text
separator

instance Ord Splitter where
  {-# INLINE compare #-}
  compare :: Splitter -> Splitter -> Ordering
compare = forall a. Ord a => a -> a -> Ordering
compare forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` Splitter -> Text
separator

instance Hashable Splitter where
  {-# INLINE hashWithSalt #-}
  hashWithSalt :: Int -> Splitter -> Int
hashWithSalt Int
salt Splitter
searcher =
    Int
salt forall a. Hashable a => Int -> a -> Int
`hashWithSalt` Splitter -> Text
separator Splitter
searcher

instance NFData Splitter where
  {-# INLINE rnf #-}
  rnf :: Splitter -> ()
rnf (Splitter AcMachine ()
searcher Text
sepLength) =
    forall a. NFData a => a -> ()
rnf AcMachine ()
searcher seq :: forall a b. a -> b -> b
`seq`
    forall a. NFData a => a -> ()
rnf Text
sepLength

instance Show Splitter where
  showsPrec :: Int -> Splitter -> ShowS
showsPrec Int
p Splitter
splitter =
    Bool -> ShowS -> ShowS
showParen (Int
p forall a. Ord a => a -> a -> Bool
> Int
10) forall a b. (a -> b) -> a -> b
$
      String -> ShowS
showString String
"build " forall b c a. (b -> c) -> (a -> b) -> a -> c
.
        forall a. Show a => Int -> a -> ShowS
showsPrec Int
11 (Splitter -> Text
separator Splitter
splitter)