{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedLists #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} {-| Module : FuzzyFind Description : Provides fuzzy matching on text Copyright : Unison Computing, 2021 License : MIT Maintainer : runar.bjarnason@unison.cloud Stability : experimental A package that provides an API for fuzzy text search in Haskell, using a modified version of the Smith-Waterman algorithm. The search is intended to behave similarly to the excellent fzf tool by Junegunn Choi. -} module Text.FuzzyFind where import Control.Monad (join) import Data.Massiv.Array ( Array, (!), Ix2(..), (...), forM, forM_ ) import qualified Data.Massiv.Array as A import qualified Data.Massiv.Array.Unsafe as A import qualified Data.Massiv.Array.Mutable as M import Data.Char (isAlphaNum, isLower, isUpper, toLower) import Data.Foldable (maximumBy, toList, foldl') import Data.Function (on) import Data.Maybe (fromMaybe) import GHC.Generics (Generic) import Data.Text (Text) import qualified Data.Text as Text import Control.Monad.ST (runST) import Data.Sequence ( Seq (..), ViewL (..), ViewR (..), viewl, viewr, (<|) ) import qualified Data.Sequence as Seq -- | @bestMatch query string@ will return 'Nothing' if @query@ is not a -- subsequence of @string@. Otherwise, it will return the "best" way to line up -- the characters in @query@ with the characters in @string@. Lower-case -- characters in the @query@ are assumed to be case-insensitive, and upper-case -- characters are assumed to be case-sensitive. -- -- For example: -- -- @ -- > bestMatch "ff" \"FuzzyFind\" -- Just (Alignment {score = 25, result = Result {[Match \"F\", Gap "uzzy", Match \"F\", Gap "ind"]}}) -- @ -- -- The score indicates how "good" the match is. Better matches have higher -- scores. There's no maximum score (except for the upper limit of the 'Int' -- datatype), but the lowest score is @0@. -- -- A substring from the query will generate a 'Match', and any characters from -- the input that don't result in a 'Match' will generate a 'Gap'. -- Concatenating all the 'Match' and 'Gap' results should yield the original -- input string. -- -- Note that the matched characters in the input always occur in the same order -- as they do in the query pattern. -- -- The algorithm prefers (and will generate higher scores for) the following -- kinds of matches: -- -- 1. Contiguous characters from the query string. For example, @bestMatch "pp"@ -- will find the last two ps in "pickled pepper". -- 2. Characters at the beginnings of words. For example, @bestMatch "pp"@ -- will find the first two Ps in \"Peter Piper\". -- 3. Characters at CamelCase humps. For example, @bestMatch "bm" \"BatMan\"@ -- will score higher than @bestMatch "bm" \"Batman\".@ -- 4. The algorithm strongly prefers the first character of the query pattern -- to be at the beginning of a word or CamelHump. For example, -- @bestMatch "mn" \"Bat Man\"@ will score higher than @bestMatch "atn" \"Batman\"@. -- -- All else being equal, matches that occur later in the input string are preferred. bestMatch :: String -- ^ The query pattern. -> String -- ^ The input string. -> Maybe Alignment bestMatch = bestMatch' defaultMatchScore defaultMismatchScore defaultGapPenalty defaultBoundaryBonus defaultCamelCaseBonus defaultFirstCharBonusMultiplier defaultConsecutiveBonus -- | Finds input strings that match all the given input patterns. For each input -- that matches, it returns one 'Alignment'. The output is not sorted. -- ascending. -- -- For example: -- -- @ -- > import Data.Foldable -- > traverse_ (putStrLn . ("\\n" ++) . highlight) $ fuzzyFind ["dad", "mac", "dam"] ["red macadamia", "Madam Card"] -- -- Madam Card -- * *** ** * -- -- red macadamia -- * ******* -- @ fuzzyFind :: [String] -- ^ The query patterns. -> [String] -- ^ The input strings. -> [Alignment] fuzzyFind = (fmap fst .) . fuzzyFindOn id -- | A version of 'fuzzyFind' that searches on the given text field of the data. fuzzyFindOn :: (a -> String) -> [String] -> [a] -> [(Alignment, a)] fuzzyFindOn f query d = d >>= (\s -> toList $ (, s) <$> foldl' (\a q -> (<>) <$> a <*> bestMatch q (f s)) (Just mempty) query ) instance Semigroup Alignment where Alignment n r <> Alignment m s = Alignment (n + m) (mergeResults r s) instance Monoid Alignment where mempty = Alignment 0 mempty type Score = Int -- | An 'Alignment' is a 'Score' together with a 'Result'. Better results have -- higher scores. data Alignment = Alignment { score :: !Score, result :: !Result } deriving (Eq, Ord, Show, Generic) -- | The base score given to a matching character defaultMatchScore :: Int defaultMatchScore = 16 -- | The base score given to a mismatched character defaultMismatchScore :: Int defaultMismatchScore = 0 -- | Bonus points given to characters matching at the beginning of words defaultBoundaryBonus :: Int defaultBoundaryBonus = defaultMatchScore `div` 2 -- | Bonus points given to characters matching a hump of a CamelCase word. -- We subtract a point from the word boundary score, since a word boundary will -- incur a gap penalty. defaultCamelCaseBonus :: Int defaultCamelCaseBonus = defaultBoundaryBonus - 1 -- | Double any bonus points for matching the first pattern of the character. -- This way we strongly prefer starting the match at the beginning of a word. defaultFirstCharBonusMultiplier :: Int defaultFirstCharBonusMultiplier = 2 -- | We prefer consecutive runs of matched characters in the pattern, so we -- impose a penalty for any gaps, which is added to the size of the gap. defaultGapPenalty :: Int defaultGapPenalty = 3 -- | We give a bonus to consecutive matching characters. -- A number about the same as the boundary bonus will prefer -- runs of consecutive characters vs finding acronyms. defaultConsecutiveBonus :: Int defaultConsecutiveBonus = 11 segmentToString :: ResultSegment -> String segmentToString (Gap xs) = xs segmentToString (Match xs) = xs -- | Renders an 'Alignment' as a pair of lines with "*" on the lower line -- indicating the location of pattern matches. highlight :: Alignment -> String highlight (Alignment s (Result segments)) = foldMap segmentToString segments <> "\n" <> foldMap showGaps segments where showGaps (Gap xs) = replicate (length xs) ' ' showGaps (Match xs) = replicate (length xs) '*' highlight' :: Alignment -> Text highlight' = Text.pack . highlight -- | A highly configurable version of 'bestMatch'. bestMatch' :: Int -- ^ Base score for a matching character. See 'defaultMatchScore'. -> Int -- ^ Base score for a mismatched character. See 'defaultMismatchScore'. -> Int -- ^ Additional penalty for a gap. See 'defaultGapPenalty'. -> Int -- ^ Bonus score for a match at the beginning of a word. See 'defaultBoundaryBonus'. -> Int -- ^ Bonus score for a match on a CamelCase hump. See 'defaultCamelCaseBonus'. -> Int -- ^ Bonus multiplier for matching the first character of the pattern. -- See 'defaultFirstCharBonusMultiplier'. -> Int -- ^ Bonus score for each consecutive character matched. -- See 'defaultFirstCharBonusMultiplier'. -> String -- ^ The query pattern. -> String -- ^ The input string. -> Maybe Alignment bestMatch' matchScore mismatchScore gapPenalty boundaryBonus camelCaseBonus firstCharBonusMultiplier consecutiveBonus query str = Alignment (totalScore m nx) . (Result . Seq.fromList) <$> traceback where totalScore i j = if i > m then 0 else (A.index' hs (i :. j)) + (A.index' bonuses (i :. j)) -- table = unlines -- [ unwords -- $ (if y > 0 then show $ str' ! y else " ") -- : [ show (totalScore x y) | x <- [0 .. m] ] -- | y <- [0 .. n] -- ] similarity a b = if a == b || a == toLower b then matchScore else mismatchScore traceback :: Maybe [ResultSegment] traceback = go [Gap $ drop nx str] [] (-1) m nx go r m currOp 0 j = (Gap (take j str) :) <$> case m of [] -> Just r _ -> case currOp of 1 -> Just (Match m : r) 0 -> Just (Gap m : r) -1 -> Nothing go _ _ _ _ 0 = Nothing go r m currOp i j = if similarity (A.index' query' (i - 1)) (A.index' str' (j - 1)) > 0 then case currOp of 0 -> go (Gap m : r) [A.index' str' (j - 1)] 1 (i - 1) (j - 1) _ -> go r (A.index' str' (j - 1) : m) 1 (i - 1) (j - 1) else case currOp of 1 -> go (Match m : r) [A.index' str' (j - 1)] 0 i (j - 1) _ -> go r (A.index' str' (j - 1) : m) 0 i (j - 1) nx = localMax m n 1 0 0 localMax m n j r s = if j > n then r else let s' = totalScore m j in localMax m n (j + 1) (if s' > s then j else r) s' query' = A.fromList A.Seq query :: Array A.U A.Ix1 Char str' = A.fromList A.Seq str :: Array A.U A.Ix1 Char m = A.unSz $ A.size query' n = A.unSz $ A.size str' hs :: Array A.U Ix2 Int hs = M.createArrayST_ (A.Sz (m + 1 :. n + 1)) $ \marr -> do A.forM_ ((0 :. 0) ... (m :. n)) $ \(i :. j) -> if (i == 0 || j == 0) then M.writeM marr (i :. j) 0 else do scoreMatch <- do hprev <- M.readM marr ((i - 1) :. (j - 1)) pure $ hprev + similarity (A.index' query' (i - 1)) (A.index' str' (j - 1)) + A.index' bonuses (i :. j) scoreGap <- do (arr :: Array A.U A.Ix1 Int) <- forM (1 ... j) $ \l -> (\x -> x - (l + gapPenalty)) <$> M.readM marr (i :. (j - l)) pure . fromMaybe 0 $ A.maximumM arr M.writeM marr (i :. j) (scoreMatch `max` scoreGap `max` 0) bonuses = A.makeArray A.Seq (A.Sz (m + 1 :. n + 1)) f :: Array A.U Ix2 Int where f (i :. j) = bonus i j bonus :: Int -> Int -> Int bonus 0 j = 0 bonus i 0 = 0 bonus i j = if similarity (A.index' query' (i - 1)) (A.index' str' (j - 1)) > 0 then multiplier * (boundary + camel + consecutive) else 0 where boundary = if j < 2 || isAlphaNum (A.index' str' (j - 1)) && not (isAlphaNum (A.index' str' (j - 2))) then boundaryBonus else 0 camel = if j > 1 && isLower (A.index' str' (j - 2)) && isUpper (A.index' str' (j - 1)) then camelCaseBonus else 0 multiplier = if i == 1 then firstCharBonusMultiplier else 1 consecutive = let similar = i > 0 && j > 0 && similarity (A.index' query' (i - 1)) (A.index' str' (j - 1)) > 0 afterMatch = i > 1 && j > 1 && similarity (A.index' query' (i - 2)) (A.index' str' (j - 2)) > 0 beforeMatch = i < m && j < n && similarity (A.index' query' i) (A.index' str' j) > 0 in if similar && (afterMatch || beforeMatch) then consecutiveBonus else 0 -- gaps :: String -> Seq ResultSegment -- gaps s = [Gap s] data ResultSegment = Gap !String | Match !String deriving (Eq, Ord, Show, Generic) -- | Concatenating all the 'ResultSegment's should yield the original input string. newtype Result = Result { segments :: Seq ResultSegment } deriving (Eq, Ord, Show, Generic) instance Monoid Result where mempty = Result [] instance Semigroup Result where Result Empty <> as = as as <> Result Empty = as Result (viewr -> h :> Gap []) <> as = Result h <> as as <> Result (viewl -> Gap [] :< t) = as <> Result t Result (viewr -> h :> Match []) <> as = Result h <> as as <> Result (viewl -> Match [] :< t) = as <> Result t Result (viewr -> i :> Gap l) <> Result (viewl -> Gap h :< t) = Result (i <> [Gap (l <> h)] <> t) Result (viewr -> i :> Match l) <> Result (viewl -> Match h :< t) = Result (i <> [Match (l <> h)] <> t) Result a <> Result b = Result (a <> b) mergeResults :: Result -> Result -> Result mergeResults as bs = merge as bs where drop' :: Int -> Result -> Result drop' n m | n < 1 = m drop' n (Result (viewl -> Gap g :< t)) = Result [Gap (drop n g)] <> drop' (n - length g) (Result t) drop' n (Result (viewl -> Match g :< t)) = Result [Match (drop n g)] <> drop' (n - length g) (Result t) merge :: Result -> Result -> Result merge (Result Seq.Empty) ys = ys merge xs (Result Seq.Empty) = xs merge (Result xs) (Result ys ) = case (viewl xs, viewl ys) of (Gap g :< t, Gap g' :< t') | length g <= length g' -> Result [Gap g] <> merge (Result t) (drop' (length g) (Result ys)) | otherwise -> Result [Gap g'] <> merge (drop' (length g') (Result xs)) (Result t') (Match m :< t, Match m' :< t') | length m >= length m' -> Result [Match m] <> merge (Result t) (drop' (length m) (Result ys)) | otherwise -> Result [Match m'] <> merge (drop' (length m') (Result xs)) (Result t') (Gap g :< t, Match m' :< t') -> Result [Match m'] <> merge (drop' (length m') (Result xs)) (Result t') (Match m :< t, Gap g' :< t') -> Result [Match m] <> merge (Result t) (drop' (length m) (Result ys))