module Data.SearchEngine.BM25F (
score,
Context(..),
FeatureFunction(..),
Doc(..),
scoreTermsBulk,
Explanation(..),
explain,
) where
import Data.Ix
import Data.Array.Unboxed
data Context term field feature = Context {
numDocsTotal :: !Int,
avgFieldLength :: field -> Float,
numDocsWithTerm :: term -> Int,
paramK1 :: !Float,
paramB :: field -> Float,
fieldWeight :: field -> Float,
featureWeight :: feature -> Float,
featureFunction :: feature -> FeatureFunction
}
data Doc term field feature = Doc {
docFieldLength :: field -> Int,
docFieldTermFrequency :: field -> term -> Int,
docFeatureValue :: feature -> Float
}
score :: (Ix field, Bounded field, Ix feature, Bounded feature) =>
Context term field feature ->
Doc term field feature -> [term] -> Float
score ctx doc terms =
sum (map (weightedTermScore ctx doc) terms)
+ sum (map (weightedNonTermScore ctx doc) features)
where
features = range (minBound, maxBound)
weightedTermScore :: (Ix field, Bounded field) =>
Context term field feature ->
Doc term field feature -> term -> Float
weightedTermScore ctx doc t =
weightIDF ctx t * tf'
/ (k1 + tf')
where
tf' = weightedDocTermFrequency ctx doc t
k1 = paramK1 ctx
weightIDF :: Context term field feature -> term -> Float
weightIDF ctx t =
log ((n n_t + 0.5) / (n_t + 0.5))
where
n = fromIntegral (numDocsTotal ctx)
n_t = fromIntegral (numDocsWithTerm ctx t)
weightedDocTermFrequency :: (Ix field, Bounded field) =>
Context term field feature ->
Doc term field feature -> term -> Float
weightedDocTermFrequency ctx doc t =
sum [ w_f * tf_f / _B_f
| field <- range (minBound, maxBound)
, let w_f = fieldWeight ctx field
tf_f = fromIntegral (docFieldTermFrequency doc field t)
_B_f = lengthNorm ctx doc field
, not (isNaN _B_f)
]
lengthNorm :: Context term field feature ->
Doc term field feature -> field -> Float
lengthNorm ctx doc field =
(1b_f) + b_f * sl_f / avgsl_f
where
b_f = paramB ctx field
sl_f = fromIntegral (docFieldLength doc field)
avgsl_f = avgFieldLength ctx field
weightedNonTermScore :: (Ix feature, Bounded feature) =>
Context term field feature ->
Doc term field feature -> feature -> Float
weightedNonTermScore ctx doc feature =
w_f * _V_f f_f
where
w_f = featureWeight ctx feature
_V_f = applyFeatureFunction (featureFunction ctx feature)
f_f = docFeatureValue doc feature
data FeatureFunction
= LogarithmicFunction Float
| RationalFunction Float
| SigmoidFunction Float Float
applyFeatureFunction :: FeatureFunction -> (Float -> Float)
applyFeatureFunction (LogarithmicFunction p1) = \fi -> log (p1 + fi)
applyFeatureFunction (RationalFunction p1) = \fi -> fi / (p1 + fi)
applyFeatureFunction (SigmoidFunction p1 p2) = \fi -> 1 / (p1 + exp (fi * p2))
scoreTermsBulk :: forall field term feature. (Ix field, Bounded field) =>
Context term field feature ->
Doc term field feature ->
(term -> (field -> Int) -> Float)
scoreTermsBulk ctx doc =
\t tFreq ->
let !tf' = sum [ w!f * tf_f / _B!f
| f <- range (minBound, maxBound)
, let tf_f = fromIntegral (tFreq f)
_B_f = _B!f
, not (isNaN _B_f)
]
in weightIDF ctx t * tf'
/ (k1 + tf')
where
!k1 = paramK1 ctx
w, _B :: UArray field Float
!w = array (minBound, maxBound)
[ (field, fieldWeight ctx field)
| field <- range (minBound, maxBound) ]
!_B = array (minBound, maxBound)
[ (field, lengthNorm ctx doc field)
| field <- range (minBound, maxBound) ]
data Explanation field feature term = Explanation {
overallScore :: Float,
termScores :: [(term, Float)],
nonTermScores :: [(feature, Float)],
termFieldScores :: [(term, [(field, Float)])]
}
deriving Show
instance Functor (Explanation field feature) where
fmap f e@Explanation{..} =
e {
termScores = [ (f t, s) | (t, s) <- termScores ],
termFieldScores = [ (f t, fs) | (t, fs) <- termFieldScores ]
}
explain :: (Ix field, Bounded field, Ix feature, Bounded feature) =>
Context term field feature ->
Doc term field feature -> [term] -> Explanation field feature term
explain ctx doc ts =
Explanation {..}
where
overallScore = sum (map snd termScores)
+ sum (map snd nonTermScores)
termScores = [ (t, weightedTermScore ctx doc t) | t <- ts ]
nonTermScores = [ (feature, weightedNonTermScore ctx doc feature)
| feature <- range (minBound, maxBound) ]
termFieldScores =
[ (t, fieldScores)
| t <- ts
, let fieldScores =
[ (f, weightedTermScore ctx' doc t)
| f <- range (minBound, maxBound)
, let ctx' = ctx { fieldWeight = fieldWeightOnly f }
]
]
fieldWeightOnly f f' | sameField f f' = fieldWeight ctx f'
| otherwise = 0
sameField f f' = index (minBound, maxBound) f
== index (minBound, maxBound) f'