Safe Haskell | Safe-Inferred |
---|---|
Language | Haskell2010 |
Synopsis
- data FuzzySet = FuzzySet {
- exactSet :: !(HashMap Text Text)
- matchDict :: !(HashMap Text [GramInfo])
- items :: !(HashMap Int (Vector FuzzySetItem))
- gramSizeLower :: !Int
- gramSizeUpper :: !Int
- useLevenshtein :: !Bool
- data FuzzySetItem = FuzzySetItem {
- vectorMagnitude :: !Double
- normalizedEntry :: !Text
- data GramInfo = GramInfo {}
- type FuzzyMatch = (Double, Text)
- grams :: Text -> Int -> [Text]
- gramVector :: Text -> Int -> HashMap Text Int
- matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
- getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch]
- add_ :: MonadState FuzzySet m => Text -> m Bool
- addMany_ :: MonadState FuzzySet m => [Text] -> m [Text]
- normalized :: Text -> Text
- norm :: [Int] -> Double
- distance :: Text -> Text -> Double
Documentation
Main fuzzy string set data type.
FuzzySet | |
|
Instances
Show FuzzySet Source # | |
Eq FuzzySet Source # | |
Monad m => MonadState FuzzySet (FuzzySearchT m) Source # | |
Defined in Data.FuzzySet.Monad get :: FuzzySearchT m FuzzySet # put :: FuzzySet -> FuzzySearchT m () # state :: (FuzzySet -> (a, FuzzySet)) -> FuzzySearchT m a # | |
MonadFuzzySearch m => MonadFuzzySearch (StateT FuzzySet m) Source # | |
data FuzzySetItem Source #
Instances
Show FuzzySetItem Source # | |
Defined in Data.FuzzySet.Internal showsPrec :: Int -> FuzzySetItem -> ShowS # show :: FuzzySetItem -> String # showList :: [FuzzySetItem] -> ShowS # | |
Eq FuzzySetItem Source # | |
Defined in Data.FuzzySet.Internal (==) :: FuzzySetItem -> FuzzySetItem -> Bool # (/=) :: FuzzySetItem -> FuzzySetItem -> Bool # |
type FuzzyMatch = (Double, Text) Source #
An individual result when looking up a string in the set, consisting of
- a similarity score in the range \([0, 1]\), and
- the matching string.
grams :: Text -> Int -> [Text] Source #
Break apart the input string into a list of n-grams. The string is first
normalized and enclosed in hyphens. We then take all substrings of length n,
letting the offset range from \(0\) to \(s + 2 - n\), where s is the length
of the normalized input.
Example:
The string "Destroido Corp." is first normalized to "destroido corp", and
then enclosed in hyphens, so that it becomes "-destroido corp-". The
trigrams generated from this normalized string are:
[ "-de" , "des" , "est" , "str" , "tro" , "roi" , "oid" , "ido" , "do " , "o c" , " co" , "cor" , "orp" , "rp-" ]
gramVector :: Text -> Int -> HashMap Text Int Source #
Generate a list of n-grams (character substrings) from the normalized input and then translate this into a dictionary with the n-grams as keys mapping to the number of occurrences of the substring in the list.
>>>
gramVector "xxxx" 2
fromList [("-x",1), ("xx",3), ("x-",1)]
The substring "xx" appears three times in the normalized string:
>>>
grams "xxxx" 2
["-x","xx","xx","xx","x-"]
>>>
Data.HashMap.Strict.lookup "nts" (gramVector "intrent'srestaurantsomeoftrent'saunt'santswantsamtorentsomepants" 3)
Just 8
getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch] Source #
normalized :: Text -> Text Source #
Normalize the input by
- removing non-word characters, except for spaces and commas; and
- converting alphabetic characters to lowercase.