module Numeric.Sibe.NaiveBayes
  (Document(..),
   NB(..),
   initialize,
   run,
   session,
   accuracy,
   precision,
   recall,
   fmeasure,
   mean,
   stdev,
   cleanText,
   cleanDocuments,
   ngram,
   ngramText,
   removeWords,
   removeStopwords,
  )
  where
import Numeric.Sibe.Utils
import Numeric.Sibe.NLP
import Data.List
import Debug.Trace
import qualified Data.Set as Set
import Data.List.Split
import Data.Maybe
import Control.Arrow ((&&&))
-- | A trained Naive Bayes model.
data NB = NB { documents  :: [Document]                  -- ^ training documents
             , classes    :: [(Class, Double)]           -- ^ classes paired with their prior probabilities
             , vocabulary :: Int                         -- ^ number of distinct words in the training set
             , megadoc    :: String                      -- ^ all training documents concatenated
             , cd         :: [(Class, [Document])]       -- ^ documents grouped by class
             , cw         :: [(Class, [(String, Int)])]  -- ^ per-class word counts
             , cgram      :: [(Class, [(String, Int)])]  -- ^ per-class bigram counts
             } deriving (Eq, Show, Read)
-- | Build a Naive Bayes model from labeled documents: compute class priors,
-- per-class word counts, per-class bigram counts and the vocabulary size.
initialize :: [Document] -> [Class] -> NB
initialize documents classes =
  let megadoc = concatDocs documents
      vocabulary = genericLength ((ordNub . words) megadoc)
      -- class priors: P(c) = documents in c / total documents
      cls = zip classes (map classPrior classes)
      cd = zip classes (map classDocs classes)
      cw = zip classes $ map classWordsCounts classes
      cgram = zip classes $ map classNGramCounts classes
  in NB { documents = documents
        , classes = cls
        , vocabulary = vocabulary
        , megadoc = megadoc
        , cd = cd
        , cw = cw
        , cgram = cgram
        }
  where
    concatDocs = concatMap (\(Document text _) -> text ++ " ")
    classDocs x = filter ((==x) . c) documents
    classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
    classWords = words . classMegadoc
    classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
    classNGramWords = words . classNGram
    classVocabulary = ordNub . classWords
    classPrior x = genericLength (classDocs x) / genericLength documents
    countWordInDoc d w = genericLength (filter (==w) d)
    wordsCount ws voc = zip voc $ map (countWordInDoc ws) voc
    classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
    classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
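
-- Illustrative usage sketch (not part of the library API): assuming 'Document'
-- and 'Class' come from Numeric.Sibe.NLP and 'Class' is an integer label, a
-- model could be built and queried roughly like this; in practice the documents
-- would likely be passed through 'cleanDocuments' first:
--
-- > let docs = [ Document "free money offer" 1
-- >            , Document "meeting at noon" 0 ]
-- >     nb   = initialize docs [0, 1]
-- > run "free offer" nb  -- (predicted class, its score)
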
-- | Run the classifier over a set of labeled documents, returning each
-- document's true class paired with the predicted (class, score).
session :: [Document] -> NB -> [(Class, (Class, Double))]
session docs nb =
  let results = map (\(Document text c) -> (c, run text nb)) docs
  in results
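
-- Illustrative sketch: the (truth, prediction) pairs produced by 'session' are
-- what the evaluation helpers re-exported from Numeric.Sibe.NLP are intended to
-- consume. The exact signatures of 'accuracy', 'precision' and 'recall' live in
-- that module, so this is only an assumed outline:
--
-- > let results = session testDocs nb
-- > accuracy results  -- fraction of documents classified correctly
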
-- | Classify a piece of text: score every class and return the best one
-- together with its score.
run :: String -> NB -> (Class, Double)
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
  let scores = map (score . fst) classes
      index = argmax scores
      m = maximum scores
  in (fst (classes !! index), m)
  where
    score c =
      let prior = snd (fromJust $ find ((==c) . fst) classes)
          -- weight rare classes higher by scaling with (1 - prior) rather than
          -- the prior itself
          alpha = 1 - prior
      in alpha * product (map (prob c) (words txt))
    prob c w =
      let fcw = fromJust $ find ((==c) . fst) cw
          fcg = fromJust $ find ((==c) . fst) cgram
          tctM = find ((== w) . fst) (snd fcw)
          tct = if isJust tctM then (snd . fromJust) tctM else 0
          cvoc = sum $ map snd (snd fcw)
          voc = vocabulary
          -- bigram count for this word; computed but not used in the score below
          gram = find ((==w) . last . splitOn "_" . fst) (snd fcg)
          pg = if isJust gram then (snd . fromJust) gram else 0
      -- Laplace (add-one) smoothed word likelihood:
      -- P(w|c) = (count(w, c) + 1) / (total words in c + vocabulary size)
      in realToFrac (tct + 1) / realToFrac (cvoc + voc)
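
-- Worked example of the smoothed likelihood above (numbers are made up): if
-- "offer" occurs 3 times in class 1, class 1 contains 200 words in total and
-- the vocabulary has 1000 distinct words, then
-- P("offer" | 1) = (3 + 1) / (200 + 1000) = 4/1200 ≈ 0.0033.
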
-- | Index of the largest element of a list.
argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
mean :: [Double] -> Double
mean x = sum x / genericLength x
-- | Sample standard deviation (Bessel-corrected, divides by n - 1).
stdev :: [Double] -> Double
stdev x =
  let avg = mean x
      variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
  in sqrt variance
-- Debug helper: print a value with 'trace' and return it unchanged.
l :: (Show a) => a -> a
l a = trace (show a) a