module NLP.Similarity.VectorSim where
import Data.DefaultMap (DefaultMap)
import qualified Data.DefaultMap as DM
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as T
import Data.List (elemIndices)
import NLP.Types
type TermVector = DefaultMap Text Double
mkVector :: Corpus -> [Text] -> TermVector
mkVector corpus doc = DM.fromList 0 $ Set.toList $
Set.map (\t->(t, tf_idf t doc corpus)) (Set.fromList doc)
sim :: Corpus -> Text -> Text -> Double
sim corpus doc1 doc2 = similarity corpus (T.words doc1) (T.words doc2)
similarity :: Corpus -> [Text] -> [Text] -> Double
similarity corpus doc1 doc2 = let
vec1 = mkVector corpus doc1
vec2 = mkVector corpus doc2
in tvSim vec1 vec2
tvSim :: TermVector -> TermVector -> Double
tvSim doc1 doc2 = let
theCos = cosVec doc1 doc2
in if isNaN theCos then 0 else theCos
tf :: Eq a => a -> [a] -> Int
tf term doc = length $ elemIndices term doc
idf :: Text -> Corpus -> Double
idf term corpus = let
docCount = corpLength corpus
containedInCount = 1 + termCounts corpus term
in log (fromIntegral docCount / fromIntegral containedInCount)
tf_idf :: Text -> [Text] -> Corpus -> Double
tf_idf term doc corp = let
corpus = addDocument corp doc
freq = tf term doc
result | freq == 0 = 0
| otherwise = (fromIntegral freq) * idf term corpus
in result
cosVec :: TermVector -> TermVector -> Double
cosVec vec1 vec2 = let
dp = dotProd vec1 vec2
mag = (magnitude vec1 * magnitude vec2)
in dp / mag
magnitude :: TermVector -> Double
magnitude v = sqrt $ DM.foldl acc 0 v
where
acc :: Double -> Double -> Double
acc cur new = cur + (new ** 2)
dotProd :: TermVector -> TermVector -> Double
dotProd xs ys = let
terms = Set.fromList (DM.keys xs) `Set.union` Set.fromList (DM.keys ys)
in Set.foldl (+) 0 (Set.map (\t -> (DM.lookup t xs) * (DM.lookup t ys)) terms)