{-# LANGUAGE RecordWildCards, BangPatterns, ScopedTypeVariables #-}

-- | An implementation of BM25F ranking. See:
--
-- * A quick overview: <https://en.wikipedia.org/wiki/Okapi_BM25>
--
-- * /The Probabilistic Relevance Framework: BM25 and Beyond/
--
-- * /An Introduction to Information Retrieval/
--   <http://nlp.stanford.edu/IR-book/>
--
module Data.SearchEngine.BM25F (
    -- * The ranking function
    score,
    Context(..),
    FeatureFunction(..),
    Doc(..),
    -- ** Specialised variants
    scoreTermsBulk,
    -- * Explaining the score
    Explanation(..),
    explain,
  ) where

import Data.Ix
import Data.Array.Unboxed

data Context term field feature = Context {
       numDocsTotal    :: !Int,
       avgFieldLength  :: field -> Float,
       numDocsWithTerm :: term -> Int,
       paramK1         :: !Float,
       paramB          :: field -> Float,
       -- consider minimum length to prevent massive B bonus?
       fieldWeight     :: field -> Float,
       featureWeight   :: feature -> Float,
       featureFunction :: feature -> FeatureFunction
     }

data Doc term field feature = Doc {
       docFieldLength        :: field -> Int,
       docFieldTermFrequency :: field -> term -> Int,
       docFeatureValue       :: feature -> Float
     }

-- | The BM25F score for a document for a given set of terms.
--
score :: (Ix field, Bounded field, Ix feature, Bounded feature) =>
         Context term field feature ->
         Doc term field feature -> [term] -> Float
score ctx doc terms =
    sum (map (weightedTermScore    ctx doc) terms)
  + sum (map (weightedNonTermScore ctx doc) features)
  where
    features = range (minBound, maxBound)
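-- A minimal sketch of how the pieces fit together. The types 'ExField' and
-- 'ExFeature' and the values 'exampleCtx' and 'exampleDoc' are hypothetical,
-- not part of this module; real uses supply their own field and feature
-- enumerations and derive the corpus statistics from an index:
--
-- > data ExField   = TitleField | BodyField deriving (Eq, Ord, Ix, Bounded)
-- > data ExFeature = Popularity             deriving (Eq, Ord, Ix, Bounded)
-- >
-- > exampleCtx :: Context String ExField ExFeature
-- > exampleCtx = Context {
-- >     numDocsTotal    = 1000,
-- >     avgFieldLength  = \f -> if f == TitleField then 5 else 500,
-- >     numDocsWithTerm = const 10,     -- would come from the index
-- >     paramK1         = 1.2,
-- >     paramB          = const 0.75,
-- >     fieldWeight     = \f -> if f == TitleField then 5 else 1,
-- >     featureWeight   = const 1,
-- >     featureFunction = const (LogarithmicFunction 1)
-- >   }
-- >
-- > exampleDoc :: Doc String ExField ExFeature
-- > exampleDoc = Doc {
-- >     docFieldLength        = \f -> if f == TitleField then 4 else 120,
-- >     docFieldTermFrequency = \_f t -> if t == "bm25" then 1 else 0,
-- >     docFeatureValue       = const 42
-- >   }
-- >
-- > exampleScore :: Float
-- > exampleScore = score exampleCtx exampleDoc ["bm25", "ranking"]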
weightedTermScore :: (Ix field, Bounded field) =>
                     Context term field feature ->
                     Doc term field feature -> term -> Float
weightedTermScore ctx doc t =
    weightIDF ctx t * tf' / (k1 + tf')
  where
    tf' = weightedDocTermFrequency ctx doc t
    k1  = paramK1 ctx

-- The Robertson-Sparck Jones IDF weight:
--
-- > log ((n - n_t + 0.5) / (n_t + 0.5))
--
-- where n is the total number of documents and n_t is the number of
-- documents containing the term. Note that it goes negative for terms
-- occurring in more than half of the documents.
weightIDF :: Context term field feature -> term -> Float
weightIDF ctx t =
    log ((n - n_t + 0.5) / (n_t + 0.5))
  where
    n   = fromIntegral (numDocsTotal ctx)
    n_t = fromIntegral (numDocsWithTerm ctx t)

weightedDocTermFrequency :: (Ix field, Bounded field) =>
                            Context term field feature ->
                            Doc term field feature -> term -> Float
weightedDocTermFrequency ctx doc t =
    sum [ w_f * tf_f / _B_f
        | field <- range (minBound, maxBound)
        , let w_f  = fieldWeight ctx field
              tf_f = fromIntegral (docFieldTermFrequency doc field t)
              _B_f = lengthNorm ctx doc field
        ]

-- Soft per-field length normalisation: interpolates between no
-- normalisation (b_f = 0) and full normalisation by relative field
-- length (b_f = 1).
lengthNorm :: Context term field feature ->
              Doc term field feature -> field -> Float
lengthNorm ctx doc field =
    (1-b_f) + b_f * sl_f / avgsl_f
  where
    b_f     = paramB ctx field
    sl_f    = fromIntegral (docFieldLength doc field)
    avgsl_f = avgFieldLength ctx field

weightedNonTermScore :: (Ix feature, Bounded feature) =>
                        Context term field feature ->
                        Doc term field feature -> feature -> Float
weightedNonTermScore ctx doc feature =
    w_f * _V_f f_f
  where
    w_f  = featureWeight ctx feature
    _V_f = applyFeatureFunction (featureFunction ctx feature)
    f_f  = docFeatureValue doc feature

data FeatureFunction
   = LogarithmicFunction   Float -- ^ @log (\lambda_i + f_i)@
   | RationalFunction      Float -- ^ @f_i / (\lambda_i + f_i)@
   | SigmoidFunction Float Float -- ^ @1 / (\lambda + exp (-(\lambda' * f_i)))@

applyFeatureFunction :: FeatureFunction -> (Float -> Float)
applyFeatureFunction (LogarithmicFunction p1) = \fi -> log (p1 + fi)
applyFeatureFunction (RationalFunction    p1) = \fi -> fi / (p1 + fi)
applyFeatureFunction (SigmoidFunction p1 p2)  = \fi -> 1 / (p1 + exp (-fi * p2))
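-- As an illustration (hypothetical numbers, not part of the API): feeding a
-- raw popularity count of 100 through @LogarithmicFunction 1@ damps it to a
-- @log (1 + f_i)@ scale before the 'featureWeight' is applied:
--
-- > applyFeatureFunction (LogarithmicFunction 1) 100   -- = log 101 ~ 4.615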
-----------------------------
-- Bulk scoring of many terms
--

-- | Most of the time we want to score several different documents for the
-- same set of terms, but sometimes we want to score one document for many
-- terms, and in that case we can save a bit of work by doing it in bulk.
-- It lets us calculate the parts that depend only on the document, and not
-- the term, once and share them between all the terms.
--
-- To take advantage of the sharing you must partially apply and name the
-- per-doc score function, e.g.
--
-- > let score :: term -> (field -> Int) -> Float
-- >     score = BM25.scoreTermsBulk ctx doc
-- >  in sum [ score t (\f -> counts ! (t, f)) | t <- ts ]
--
scoreTermsBulk :: forall field term feature.
                  (Ix field, Bounded field) =>
                  Context term field feature ->
                  Doc term field feature ->
                  (term -> (field -> Int) -> Float)
scoreTermsBulk ctx doc =
    -- This is just a rearrangement of weightedTermScore and
    -- weightedDocTermFrequency above, with the doc-constant bits hoisted out.
    \t tFreq ->
      let !tf' = sum [ w!f * tf_f / _B!f
                     | f <- range (minBound, maxBound)
                     , let tf_f = fromIntegral (tFreq f)
                     ]
       in weightIDF ctx t * tf' / (k1 + tf')
  where
    -- So long as the caller does the partial application thing then these
    -- values can all be shared between many calls with different terms.
    !k1 = paramK1 ctx
    w, _B :: UArray field Float
    !w  = array (minBound, maxBound)
                [ (field, fieldWeight ctx field)
                | field <- range (minBound, maxBound) ]
    !_B = array (minBound, maxBound)
                [ (field, lengthNorm ctx doc field)
                | field <- range (minBound, maxBound) ]


------------------
-- Explanation
--

-- | A breakdown of the BM25F score, to explain somewhat how it relates to
-- the inputs, and so you can compare the scores of different documents.
--
data Explanation field feature term = Explanation {
       -- | The overall score is the sum of the 'termScores' and the
       -- 'nonTermScores'.
       overallScore  :: Float,

       -- | There is a score contribution from each query term. This is the
       -- score for the term across all fields in the document (but see
       -- 'termFieldScores').
       termScores    :: [(term, Float)],
{-
       -- | There is a score contribution for positional information. Terms
       -- appearing in the document close together give a bonus.
       positionScore :: [(field, Float)],
-}
       -- | The document can have an innate bonus score independent of the
       -- terms in the query. For example this might be a popularity score.
       nonTermScores :: [(feature, Float)],

       -- | This does /not/ contribute to the 'overallScore'. It is an
       -- indication of how the 'termScores' relates to per-field scores.
       -- Note however that the term score for all fields is /not/ simply
       -- the sum of the per-field scores. The point of the BM25F scoring
       -- function is that a linear combination of per-field scores is
       -- wrong, and BM25F does a more cunning non-linear combination.
       --
       -- However, it is still useful as an indication to see the scores
       -- for each field for a term, to see how they compare.
       --
       termFieldScores :: [(term, [(field, Float)])]
     }
  deriving Show

instance Functor (Explanation field feature) where
  fmap f e@Explanation{..} =
    e {
      termScores      = [ (f t, s)  | (t, s)  <- termScores ],
      termFieldScores = [ (f t, fs) | (t, fs) <- termFieldScores ]
    }

explain :: (Ix field, Bounded field, Ix feature, Bounded feature) =>
           Context term field feature ->
           Doc term field feature -> [term] ->
           Explanation field feature term
explain ctx doc ts = Explanation {..}
  where
    overallScore  = sum (map snd termScores)
--                + sum (map snd positionScore)
                  + sum (map snd nonTermScores)
    termScores    = [ (t, weightedTermScore ctx doc t) | t <- ts ]
--  positionScore = [ (f, 0) | f <- range (minBound, maxBound) ]
    nonTermScores = [ (feature, weightedNonTermScore ctx doc feature)
                    | feature <- range (minBound, maxBound) ]

    termFieldScores =
      [ (t, fieldScores)
      | t <- ts
      , let fieldScores =
              [ (f, weightedTermScore ctx' doc t)
              | f <- range (minBound, maxBound)
              , let ctx' = ctx { fieldWeight = fieldWeightOnly f }
              ]
      ]

    fieldWeightOnly f f' | sameField f f' = fieldWeight ctx f'
                         | otherwise      = 0

    sameField f f' = index (minBound, maxBound) f
                  == index (minBound, maxBound) f'
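-- A sketch of inspecting a score with 'explain', reusing the hypothetical
-- 'exampleCtx' and 'exampleDoc' from the sketch after 'score' above:
--
-- > let ex = explain exampleCtx exampleDoc ["bm25", "ranking"]
-- >  in (overallScore ex, termScores ex, nonTermScores ex)
--
-- Comparing 'overallScore' against the per-term and per-feature breakdowns
-- shows how much of a document's score comes from the query terms versus
-- innate features such as popularity.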