-- Alfred-Margaret: Fast Aho-Corasick string searching -- Copyright 2019 Channable -- -- Licensed under the 3-clause BSD license, see the LICENSE file in the -- repository root. {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} module Data.Text.AhoCorasickSpec (spec) where import Control.DeepSeq (rnf) import Control.Monad (forM_, unless) import Data.Foldable (foldl') import Data.List.NonEmpty (NonEmpty ((:|))) import Data.Word (Word16) import Test.Hspec (Spec, Expectation, describe, it, parallel, shouldBe) import Test.Hspec.Expectations (shouldMatchList, shouldSatisfy) import Test.Hspec.QuickCheck (modifyMaxSuccess, modifyMaxSize, prop) import Test.QuickCheck (Arbitrary (arbitrary, shrink), forAll, forAllShrink, (==>)) import Test.QuickCheck.Instances () import Test.QuickCheck.Gen (Gen) import GHC.Stack (HasCallStack) import Data.Text (Text) import Prelude hiding (replicate) import qualified Data.Char as Char import qualified Data.Text as Text import qualified Data.Text.Internal.Search as TextSearch import qualified Data.Text.Unsafe as TextUnsafe import qualified Test.QuickCheck as QuickCheck import qualified Test.QuickCheck.Gen as Gen import Data.Text.AhoCorasick.Automaton (CaseSensitivity (..)) import Data.Text.Orphans () import qualified Data.Text.AhoCorasick.Automaton as Aho import qualified Data.Text.AhoCorasick.Replacer as Replacer import qualified Data.Text.AhoCorasick.Splitter as Splitter import qualified Data.Text.Utf16 as Utf16 -- | Test that for a single needle which equals the haystack, we find a single -- match. Does not apply to the empty needle. needleIsHaystackMatches :: HasCallStack => Text -> Expectation needleIsHaystackMatches needle = let needleUtf16 = Utf16.unpackUtf16 needle len = Utf16.lengthUtf16 needle prependMatch ms match = Aho.Step (match : ms) matches = Aho.runText [] prependMatch (Aho.build [(needleUtf16, ())]) needle in matches `shouldBe` [Aho.Match len ()] ahoMatch :: [(Text, a)] -> Text -> [Aho.Match a] ahoMatch needles haystack = let makeNeedle (text, value) = (Utf16.unpackUtf16 text, value) needlesUtf16 = fmap makeNeedle needles prependMatch matches match = Aho.Step (match : matches) in Aho.runText [] prependMatch (Aho.build needlesUtf16) haystack -- | Match without a payload, return only the match positions. matchPositions :: [Text] -> Text -> [Int] matchPositions needles haystack = let withUnit x = (x, ()) matches = ahoMatch (fmap withUnit needles) haystack in fmap (Utf16.codeUnitIndex . Aho.matchPos) matches -- | `matchPositions` implemented naively in terms of Text's functionality, -- which we assume to be correct. naiveMatchPositions :: [Text] -> Text -> [Int] naiveMatchPositions needles haystack = let prependMatch :: [Int] -> Text -> Int -> Text -> [Int] prependMatch matches needle offset haystackSlice = if Text.null haystack then matches -- Text.indices returns all non-overlapping occurrences of the needle, -- but we want the overlapping ones as well. So we only consider the -- first match, and then search again starting from one past the -- beginning of the match. else case TextSearch.indices needle haystackSlice of [] -> matches i:_ -> prependMatch (match : matches) needle offset' remainingHaystack where -- The match index is the index past the end, not the start index. match = offset + i + TextUnsafe.lengthWord16 needle offset' = offset + i + 1 remainingHaystack = TextUnsafe.dropWord16 (i + 1) haystackSlice prependMatches matches needle = prependMatch matches needle 0 haystack in foldl' prependMatches [] needles -- | Generate random needles and haystacks, such that the needles have a -- reasonable probability of occuring in the haystack, which would hardly be the -- case if we just generated random texts for all of them. We do this by first -- generating a set of fragments, and then building the haystack and needles by -- combining these fragments. By doing this, we also get a lot of partial -- matches, where part of a needle does occur in the haystack, but the full -- needle does not, and also needles with a shared prefix or suffix. This should -- fully stress the possible transitions in the automaton. arbitraryNeedlesHaystack :: Gen ([Text], Text) arbitraryNeedlesHaystack = do let -- Prefer ascii just to have printable test cases, but do include the other -- generator to cover the entire range of code points. genChar = Gen.frequency [ (4, QuickCheck.arbitraryASCIIChar) , (1, QuickCheck.arbitrary) ] genNonEmptyText = do chars <- Gen.listOf1 genChar pure $ Text.pack chars fragments <- Gen.listOf1 $ Gen.resize 5 genNonEmptyText let genFragment = Gen.elements $ filter (not . Text.null) fragments genSmall = Gen.scale (`div` 3) $ Gen.listOf1 genFragment genBig = Gen.scale (* 4) $ Gen.listOf1 genFragment needles <- Gen.listOf1 (fmap Text.concat genSmall) haystack <- fmap Text.concat genBig pure (needles, haystack) spec :: Spec spec = parallel $ do modifyMaxSuccess (const 200) $ do describe "build" $ do prop "does not throw exceptions" $ \ (kv :: [([Word16], Int)]) -> rnf $ Aho.build kv describe "unpackUtf16" $ do it "unpacks code point U+437b8" $ -- Note that 0x437b8 lies in the currently unassigned "Plane 5"; the -- code point does not currently exist, but that should not bother us. -- Check in Python: '\U000437b8'.encode('utf-16be') Utf16.unpackUtf16 "\x000437b8" `shouldBe` [0xd8cd, 0xdfb8] it "unpacks adjacent nulls individually" $ do Utf16.unpackUtf16 "c\NULe" `shouldBe` [99, 0, 101] Utf16.unpackUtf16 "bc\NUL\NULe" `shouldBe` [98, 99, 0, 0, 101] describe "runText" $ do describe "when given a needle equal to the haystack" $ do it "reports a single match for a repeated character" $ forM_ [1..128] $ \n -> needleIsHaystackMatches $ Text.replicate n "a" it "reports a single match for non-BMP data" $ do -- Include a few code points outside of the Basic Multilingual Plane, -- which require multible code units to encode. needleIsHaystackMatches "\x000437b8suffix" needleIsHaystackMatches "aaa\359339aaa\95759aa\899256aa" prop "reports a single match for random needles" $ \needle -> not (Text.null needle) ==> needleIsHaystackMatches needle describe "when given a sliced text (with nonzero internal offset)" $ it "still reports offset relative to the text start" $ -- The match position should be relative to the start of the text "a". -- Even if this text is represented as a slice of "bbba" internally. matchPositions ["a"] (Text.dropWhile (== 'b') "bbba") `shouldMatchList` [1] describe "when given non-ascii inputs" $ do -- We have a special lookup table for transitions from the base state -- for the first 128 code units, which is always hit for ascii inputs. -- Also exercise the fallback code path with a different input. it "reports a match if the first haystack character is > U+7f" $ do matchPositions ["eclair"] "éclair" `shouldMatchList` [] matchPositions ["éclair"] "éclair" `shouldMatchList` [6] matchPositions ["éclair"] "eclair" `shouldMatchList` [] it "reports the correct UTF-16 index for surrogate pairs" $ do -- Note that the index after the match is 2, even though there is -- only a single code point. U+1d11e is encoded as two code units -- in UTF-16. matchPositions ["𝄞"] "𝄞" `shouldMatchList` [2] -- A levitating woman in business suit with dark skin tone needs a -- whopping 5 code points to encode, of which the first two need a -- surrogate pair in UTF-16, for a total of 7 code units. -- U+1f574: man in business suit levitating -- U+1f3ff: emoji modifier Fitzpatrick type-6 -- U+200d: zero width joiner -- U+2640: female sign -- U+fe0f: variation selector-16 -- A peculiar feature of Unicode emoji, is that the male levitating -- man in business suit with dark skin tone is a substring of the -- levitating woman in business suit. And the levitating man in -- business suit without particular skin tone is a substring of that. matchPositions [ "\x1f574\x1f3ff\x200d\x2640\xfe0f" , "\x1f574\x1f3ff" , "\x1f574" ] "\x1f574\x1f3ff\x200d\x2640\xfe0f" `shouldMatchList` [2, 4, 7] describe "when given overlapping needles" $ do it "finds exactly all matches" $ do matchPositions ["foobar", "bar"] "foobar" `shouldMatchList` [6, 6] matchPositions ["foobarbaz", "bar"] "xfoobarbazy" `shouldMatchList` [10, 7] matchPositions ["foobar", "foo"] "xfoobarbazy" `shouldMatchList` [7, 4] it "keeps the value associated with a needle" $ do fmap Aho.matchValue (ahoMatch [("foo", 'A'), ("bar", 'B')] "foobar") `shouldMatchList` ['A', 'B'] fmap Aho.matchValue (ahoMatch [("foo", 'A'), ("bar", 'B')] "foobaz") `shouldMatchList` ['A'] fmap Aho.matchValue (ahoMatch [("foo", 'A'), ("bar", 'B')] "foebar") `shouldMatchList` ['B'] it "reports both matches in case of a duplicate needle" $ fmap Aho.matchValue (ahoMatch [("foo", 'A'), ("foo", 'B')] "foobar") `shouldMatchList` ['A', 'B'] it "finds all quadratic matches" $ forM_ ["a", "ab", "abc"] $ \baseText -> forM_ [1..33] $ \n -> let replicate k = Text.replicate k baseText needles = fmap replicate [1..n] matches = matchPositions needles (replicate n) in -- The needle of length 1 matches n times, the needle of length -- 2 matches n - 1 times, ..., the needle of length n matches -- once. length matches `shouldBe` sum [1..n] describe "when given partially overlapping needles" $ do it "finds exactly all matches" $ do matchPositions ["ab", "bcd"] "abccd" `shouldMatchList` [2] matchPositions ["abc","cde"] "abcdde" `shouldMatchList` [3] matchPositions ["c","c\NULe"] "c\NUL\NULe" `shouldMatchList` [1] -- The case below is a regression test; it did fail before; it would -- report a match at position 5 in addition to position 2. matchPositions ["bc","c\NULe"] "bc\NUL\NULe" `shouldMatchList` [2] describe "when given empyt needles" $ do it "does not report a match" $ do matchPositions [""] "" `shouldMatchList` [] matchPositions [""] "foo" `shouldMatchList` [] describe "when given random needles and haystacks" $ do prop "reports only infixes of the haystack" $ QuickCheck.forAllShrink arbitraryNeedlesHaystack shrink $ \ (needles, haystack) -> let dup x = (x, x) matches = ahoMatch (fmap dup needles) haystack sliceMatch endPos len = Utf16.unsafeSliceUtf16 (endPos - len) len haystack in -- Discard inputs for which there are no matches, to ensure we get -- enough coverage for the case where there are matches. not (null matches) ==> forM_ matches $ \ (Aho.Match pos needle) -> do needle `shouldSatisfy` (`Text.isInfixOf` haystack) sliceMatch pos (Utf16.lengthUtf16 needle) `shouldBe` needle prop "reports all infixes of the haystack" $ QuickCheck.forAllShrink arbitraryNeedlesHaystack shrink $ \ (needles, haystack) -> matchPositions needles haystack `shouldMatchList` naiveMatchPositions needles haystack let isSurrogate cu = cu >= 0xd800 && cu < 0xe000 describe "Char.toLower" $ do -- We test that Char.toLower maps the BMP onto itself, because this implies -- that changing casing code unit by code unit does not change the number of -- code units, which allows us to implement lowercasing in an optimized -- manner. it "maps the Basic Multilingual Plane onto itself" $ forM_ [0 .. maxBound :: Utf16.CodeUnit] $ \cu -> unless (isSurrogate cu) $ let lower = Char.ord $ Char.toLower $ Char.chr $ fromIntegral cu in lower `shouldSatisfy` not . isSurrogate describe "Utf16.lowerCodeUnit" $ it "is equivalent to Char.toLower on the BMP" $ forM_ [0 .. maxBound :: Utf16.CodeUnit] $ \cu -> unless (isSurrogate cu) $ let lowerAsChar = fromIntegral . Char.ord . Char.toLower . Char.chr . fromIntegral in lowerAsChar cu `shouldBe` Utf16.lowerCodeUnit cu describe "Char.toUpper" $ do -- We test that Char.toUpper maps the BMP onto itself, because this implies -- that changing casing code unit by code unit does not change the number of -- code units, which allows us to implement upppercasing in an optimized -- manner. it "maps the Basic Multilingual Plane onto itself" $ forM_ [0 .. maxBound :: Utf16.CodeUnit] $ \cu -> unless (isSurrogate cu) $ let upper = Char.ord $ Char.toUpper $ Char.chr $ fromIntegral cu in upper `shouldSatisfy` not . isSurrogate describe "Utf16.upperCodeUnit" $ it "is equivalent to Char.toUpper on the BMP" $ forM_ [0 .. maxBound :: Utf16.CodeUnit] $ \cu -> unless (isSurrogate cu) $ let upperAsChar = fromIntegral . Char.ord . Char.toUpper . Char.chr . fromIntegral in upperAsChar cu `shouldBe` Utf16.upperCodeUnit cu modifyMaxSize (const 10) $ describe "Replacer.run" $ do let genHaystack = fmap Text.pack $ Gen.listOf $ Gen.frequency [(40, Gen.elements "abAB"), (1, pure 'İ'), (1, arbitrary)] -- needles may not be empty, because empty needles are filtered out in an I.ActionReplaceMultiple genNeedle = fmap Text.pack $ Gen.resize 3 $ Gen.listOf1 $ Gen.elements "abAB" genReplaces = Gen.listOf $ (,) <$> genNeedle <*> arbitrary shrinkReplaces = filter (not . any (\(needle, _) -> Text.null needle)) . shrink replace needles haystack = Replacer.run (Replacer.build CaseSensitive needles) haystack replaceIgnoreCase needles haystack = Replacer.run (Replacer.build IgnoreCase needles) haystack it "replaces all occurrences" $ do replace [("A", "B")] "AXAXB" `shouldBe` "BXBXB" replace [("A", "B"), ("X", "Y")] "AXAXB" `shouldBe` "BYBYB" replace [("aaa", ""), ("b", "c")] "aaabaaa" `shouldBe` "c" -- Have a few non-matching needles too. replace [("A", "B"), ("Q", "r"), ("Z", "")] "AXAXB" `shouldBe` "BXBXB" it "replaces only non-overlapping matches" $ do replace [("aa", "zz"), ("bb", "w")] "aaabbb" `shouldBe` "zzawb" replace [("aaa", "")] "aaaaa" `shouldBe` "aa" it "replaces all occurrences in priority order" $ do replace [("A", ""), ("BBBB", "bingo")] "BBABB" `shouldBe` "bingo" replace [("BB", ""), ("BBBB", "bingo")] "BBBB" `shouldBe` "" it "replaces needles that contain a surrogate pair" $ replace [("\x1f574", "levitating man in business suit")] "the \x1f574" `shouldBe` "the levitating man in business suit" it "replaces all occurrences case-insensitively" $ do replaceIgnoreCase [("A", "B")] "AXAXB" `shouldBe` "BXBXB" replaceIgnoreCase [("A", "B")] "axaxb" `shouldBe` "BxBxb" replaceIgnoreCase [("a", "b")] "AXAXB" `shouldBe` "bXbXB" replaceIgnoreCase [("A", "B"), ("X", "Y")] "AXAXB" `shouldBe` "BYBYB" replaceIgnoreCase [("A", "B"), ("X", "Y")] "axaxb" `shouldBe` "BYBYb" replaceIgnoreCase [("a", "b"), ("x", "y")] "AXAXB" `shouldBe` "bybyB" it "matches replacements case-insensitively" $ replaceIgnoreCase [("foo", "BAR"), ("bar", "BAZ")] "Foo" `shouldBe` "BAZ" it "matches replacements case-insensitively for non-ascii characters" $ do replaceIgnoreCase [("éclair", "lightning")] "Éclair" `shouldBe` "lightning" -- Note: U+0319 is an uppercase alpha, which looks exactly like A, but it -- is a different code point. replaceIgnoreCase [("bèta", "α"), ("\x0391", "alpha")] "BÈTA" `shouldBe` "alpha" it "matches surrogate pairs case-insensitively" $ do -- We can't lowercase a levivating man in business suit, but that should -- not affect whether we match it or not. replaceIgnoreCase [("\x1f574", "levitating man in business suit")] "the \x1f574" `shouldBe` "the levitating man in business suit" prop "satisfies (run . compose a b) == (run b (run a))" $ forAllShrink genHaystack shrink $ \haystack -> forAll arbitrary $ \case_ -> forAllShrink genReplaces shrinkReplaces $ \replaces1 -> forAllShrink genReplaces shrinkReplaces $ \replaces2 -> let rm1 = Replacer.build case_ replaces1 rm2 = Replacer.build case_ replaces2 Just rm12 = Replacer.compose rm1 rm2 in Replacer.run rm2 (Replacer.run rm1 haystack) `shouldBe` Replacer.run rm12 haystack prop "is identity for empty needles" $ \case_ haystack -> let replacerId = Replacer.build case_ [] in Replacer.run replacerId haystack `shouldBe` haystack prop "is equivalent to sequential Text.replace calls" $ forAllShrink genHaystack shrink $ \haystack -> forAllShrink genReplaces shrinkReplaces $ \replaces -> let replacer = Replacer.build CaseSensitive replaces replaceText agg (needle, replacement) = Text.replace needle replacement agg expected = foldl' replaceText haystack replaces in Replacer.run replacer haystack `shouldBe` expected describe "Splitter.split" $ it "passes an example" $ let separator = "bob" in let splitter = Splitter.build separator in Splitter.split splitter "C++bobobCOBOLbobScala" `shouldBe` "C++" :| ["obCOBOL", "Scala"]