-- | -- Module : Data.Text.Metrics -- Copyright : © 2016 Mark Karpov -- License : BSD 3 clause -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : portable -- -- The module provides efficient implementations of various strings metrics. -- It works with strict 'Text' values and returns either 'Natural' numbers -- (because the metrics cannot be negative), or @'Ratio' 'Natural'@ values -- because returned values are rational non-negative numbers by definition. -- -- The functions provided here are the fastest implementations available for -- use in Haskell programs. In fact the functions are implemented in C for -- maximal efficiency, but this leads to a minor flaw. When we work with -- 'Text' values in C, they are represented as UTF-16 encoded strings of -- two-byte values. The algorithms treat the strings as if a character -- corresponds to one element in such strings, which is true for almost all -- modern text data. However, there are characters that are represented by -- two adjoined elements in UTF-16: emoji, historic scripts, less used -- Chinese ideographs, and some more. If input 'Text' of the functions -- contains such characters, the functions may return slightly incorrect -- result. Decide for yourself if this is acceptable for your use case, but -- chances are you will never run into situations when the functions produce -- incorrect results. {-# LANGUAGE CPP #-} {-# LANGUAGE ForeignFunctionInterface #-} {-# LANGUAGE OverloadedStrings #-} module Data.Text.Metrics ( -- * Levenshtein variants levenshtein , levenshteinNorm , damerauLevenshtein , damerauLevenshteinNorm -- * Other , hamming , jaro , jaroWinkler ) where import Data.Ratio import Data.Text import Foreign import Foreign.C.Types import Numeric.Natural import System.IO.Unsafe import qualified Data.Text as T import qualified Data.Text.Foreign as TF #if !MIN_VERSION_base(4,8,0) import Control.Applicative #endif ---------------------------------------------------------------------------- -- Levenshtein variants -- | Return Levenshtein distance between two 'Text' values. Classic -- Levenshtein distance between two strings is minimal number of operations -- necessary to transform one string into another. For Levenshtein distance -- allowed operations are: deletion, insertion, and substitution. -- -- See also: . levenshtein :: Text -> Text -> Natural levenshtein = withTwo c_levenshtein foreign import ccall unsafe "tmetrics_levenshtein" c_levenshtein :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt -- | Return normalized Levenshtein distance between two 'Text' values. -- Result is a non-negative rational number (represented as @'Ratio' -- 'Natural'@), where 0 signifies no similarity between the strings, while 1 -- means exact match. The operation is virtually as fast as 'levenshtein'. -- -- See also: . levenshteinNorm :: Text -> Text -> Ratio Natural levenshteinNorm = norm levenshtein {-# INLINE levenshteinNorm #-} -- | Return Damerau-Levenshtein distance between two 'Text' values. The -- function works like 'levenshtein', but the collection of allowed -- operations also includes transposition of two /adjacent/ characters. The -- function is about 20% slower than 'levenshtein', but still pretty fast. -- -- See also: . damerauLevenshtein :: Text -> Text -> Natural damerauLevenshtein = withTwo c_damerau_levenshtein foreign import ccall unsafe "tmetrics_damerau_levenshtein" c_damerau_levenshtein :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt -- | Return normalized Damerau-Levenshtein distance between two 'Text' -- values. Result is a non-negative rational number (represented as @'Ratio' -- 'Natural'@), where 0 signifies no similarity between the strings, while 1 -- means exact match. The operation is virtually as fast as -- 'damerauLevenshtein'. -- -- See also: . damerauLevenshteinNorm :: Text -> Text -> Ratio Natural damerauLevenshteinNorm = norm damerauLevenshtein {-# INLINE damerauLevenshteinNorm #-} ---------------------------------------------------------------------------- -- Other -- | /O(n)/ Return Hamming distance between two 'Text' values. Hamming -- distance is defined as number of positions at which the corresponding -- symbols are different. The input 'Text' values should be of equal length -- or 'Nothing' will be returned. -- -- See also: . hamming :: Text -> Text -> Maybe Natural hamming a b = if T.length a == T.length b then Just . unsafePerformIO . TF.useAsPtr a $ \aptr size -> TF.useAsPtr b $ \bptr _ -> fromIntegral <$> c_hamming (fromIntegral size) aptr bptr else Nothing foreign import ccall unsafe "tmetrics_hamming" c_hamming :: CUInt -> Ptr Word16 -> Ptr Word16 -> IO CUInt -- | Return Jaro distance between two 'Text' values. Returned value is in -- range from 0 (no similarity) to 1 (exact match). -- -- While the algorithm is pretty clear for artificial examples (like those -- from the linked Wikipedia article), for /arbitrary/ strings, it may be -- hard to decide which of two strings should be considered as one having -- “reference” order of characters (since order of matching characters in an -- essential part of the definition of the algorithm). This makes us -- consider the first string the “reference” string (with correct order of -- characters). Thus generally, -- -- > jaro a b ≠ jaro b a -- -- This asymmetry can be found in all implementations of the algorithm on -- the internet, AFAIK. -- -- See also: -- -- @since 0.2.0 jaro :: Text -> Text -> Ratio Natural jaro = jaroCommon (\_ _ _ _ x -> return x) jaroCommon :: (CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> Ratio Natural -> IO (Ratio Natural)) -> Text -> Text -> Ratio Natural jaroCommon f a b = unsafePerformIO $ alloca $ \m' -> alloca $ \t' -> TF.useAsPtr a $ \aptr asize -> TF.useAsPtr b $ \bptr bsize -> if asize == 0 || bsize == 0 then return (1 % 1) else do let asize' = fromIntegral asize bsize' = fromIntegral bsize c_jaro m' t' asize' aptr bsize' bptr m <- fromIntegral <$> peek m' t <- fromIntegral <$> peek t' f asize' aptr bsize' bptr $ if m == 0 then 0 else ((m % fromIntegral asize) + (m % fromIntegral bsize) + ((m - t) % m)) / 3 {-# INLINE jaroCommon #-} foreign import ccall unsafe "tmetrics_jaro" c_jaro :: Ptr CUInt -> Ptr CUInt -> CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO () -- | Return Jaro-Winkler distance between two 'Text' values. Returned value -- is in range from 0 (no similarity) to 1 (exact match). -- -- See also: -- -- @since 0.2.0 jaroWinkler :: Text -> Text -> Ratio Natural jaroWinkler = jaroCommon g where g asize aptr bsize bptr dj = do l <- fromIntegral <$> c_common_prefix asize aptr bsize bptr return (dj + (1 % 10) * l * (1 - dj)) foreign import ccall unsafe "tmetrics_common_prefix" c_common_prefix :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt ---------------------------------------------------------------------------- -- Helpers withTwo :: (CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt) -> Text -> Text -> Natural withTwo f a b = unsafePerformIO . TF.useAsPtr a $ \aptr asize -> TF.useAsPtr b $ \bptr bsize -> fromIntegral <$> f (fromIntegral asize) aptr (fromIntegral bsize) bptr {-# INLINE withTwo #-} norm :: (Text -> Text -> Natural) -> Text -> Text -> Ratio Natural norm f a b = let r = f a b in if r == 0 then 1 % 1 else 1 % 1 - r % fromIntegral (max (T.length a) (T.length b)) {-# INLINE norm #-}