-- Alfred-Margaret: Fast Aho-Corasick string searching -- Copyright 2019 Channable -- -- Licensed under the 3-clause BSD license, see the LICENSE file in the -- repository root. {-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} -- | Splitting strings using Aho–Corasick. module Data.Text.AhoCorasick.Splitter ( Splitter , build , automaton , separator , split , splitIgnoreCase , splitReverse , splitReverseIgnoreCase ) where import Control.DeepSeq (NFData (..)) import Data.Function (on) import Data.Hashable (Hashable (..)) import Data.List.NonEmpty (NonEmpty ((:|))) import Data.Text (Text) #if defined(HAS_AESON) import qualified Data.Aeson as AE #endif import qualified Data.List.NonEmpty as NonEmpty import Data.Text.AhoCorasick.Automaton (AcMachine) import qualified Data.Text.AhoCorasick.Automaton as Aho import qualified Data.Text.Utf16 as Utf16 -------------------------------------------------------------------------------- -- Splitter -- | Build a splitter once, then use it many times! data Splitter = Splitter { splitterAutomaton :: AcMachine () -- INVARIANT: Exactly one needle. , splitterSeparator :: Text -- INVARIANT: Equivalent to needle. } #if defined(HAS_AESON) instance AE.ToJSON Splitter where toJSON = AE.toJSON . separator instance AE.FromJSON Splitter where parseJSON v = build <$> AE.parseJSON v #endif -- | Construct a splitter with a single separator. {-# INLINE build #-} build :: Text -> Splitter build sep = let !auto = Aho.build [(Utf16.unpackUtf16 sep, ())] in Splitter auto sep -- | Get the automaton that would be used for finding separators. {-# INLINE automaton #-} automaton :: Splitter -> AcMachine () automaton = splitterAutomaton -- | What is the separator we are splitting on? {-# INLINE separator #-} separator :: Splitter -> Text separator = splitterSeparator -- | Split the given string into strings separated by the separator. -- -- If the order of the results is not important, use the faster function -- 'splitReverse' instead. {-# INLINE split #-} split :: Splitter -> Text -> NonEmpty Text split = (NonEmpty.reverse .) . splitReverse -- | Split the given string into strings separated by the separator. -- -- If the order of the results is not important, use the faster function -- 'splitReverseIgnoreCase' instead. -- -- The separator is matched case-insensitively, but the splitter must have been -- constructed with a lowercase needle. {-# INLINE splitIgnoreCase #-} splitIgnoreCase :: Splitter -> Text -> NonEmpty Text splitIgnoreCase = (NonEmpty.reverse .) . splitReverseIgnoreCase -- | Like 'split', but return the substrings in reverse order. {-# INLINE splitReverse #-} splitReverse :: Splitter -> Text -> NonEmpty Text splitReverse s t = finalizeAccum $ Aho.runText (zeroAccum (separator s) t) stepAccum (automaton s) t -- | Like 'splitIgnoreCase', but return the substrings in reverse order. {-# INLINE splitReverseIgnoreCase #-} splitReverseIgnoreCase :: Splitter -> Text -> NonEmpty Text splitReverseIgnoreCase s t = finalizeAccum $ Aho.runLower (zeroAccum (separator s) t) stepAccum (automaton s) t -------------------------------------------------------------------------------- -- Fold -- | The accumulator is used as state when processing the matches from left to -- right. While the matches are fed to us ordered by end offset, all matches -- have the same length because there is only one needle. data Accum = Accum { _accumSepLen :: !Aho.CodeUnitIndex -- ^ Length of separator. , _accumHaystack :: !Text -- ^ Haystack to slice off of. , accumResult :: ![Text] -- ^ Match-separated strings. , accumPrevEnd :: !Aho.CodeUnitIndex -- ^ Offset at end of last match. } -- | Finalizing the accumulator does more than just 'accumResult', hence this -- is a separate function. {-# INLINE finalizeAccum #-} finalizeAccum :: Accum -> NonEmpty Text finalizeAccum (Accum _ hay res prevEnd) = -- Once we have processed all the matches, there is still the substring after -- the final match. This substring is always included in the result, even -- when there were no matches. Hence we can return a non-empty list. let !str = Utf16.unsafeSliceUtf16 prevEnd (Utf16.lengthUtf16 hay - prevEnd) hay in str :| res -- | The initial accumulator begins at the begin of the haystack. {-# INLINE zeroAccum #-} zeroAccum :: Text -> Text -> Accum zeroAccum sep hay = Accum (Utf16.lengthUtf16 sep) hay [] 0 -- | Step the accumulator using the next match. Overlapping matches will be -- ignored. Overlapping matches may occur when the separator has a non-empty -- prefix that is also a suffix. {-# INLINE stepAccum #-} stepAccum :: Accum -> Aho.Match v -> Aho.Next Accum stepAccum acc@(Accum sepLen hay res prevEnd) (Aho.Match sepEnd _) -- When the match begins before the current offset, it overlaps a match that -- we processed before, and so we ignore it. | sepEnd - sepLen < prevEnd = Aho.Step acc -- The match is behind the current offset, so we slice the haystack until the -- begin of the match and include that as a result. | otherwise = let !str = Utf16.unsafeSliceUtf16 prevEnd (sepEnd - sepLen - prevEnd) hay in Aho.Step acc { accumResult = str : res, accumPrevEnd = sepEnd } -------------------------------------------------------------------------------- -- Instances instance Eq Splitter where {-# INLINE (==) #-} (==) = (==) `on` separator instance Ord Splitter where {-# INLINE compare #-} compare = compare `on` separator instance Hashable Splitter where {-# INLINE hashWithSalt #-} hashWithSalt salt searcher = salt `hashWithSalt` separator searcher instance NFData Splitter where {-# INLINE rnf #-} rnf (Splitter searcher sepLength) = rnf searcher `seq` rnf sepLength instance Show Splitter where showsPrec p splitter = showParen (p > 10) $ showString "build " . showsPrec 11 (separator splitter)