module EvalSem (evalSem) where import qualified Data.Map as Map import qualified Data.Set as Set import Reader (readcorpus,Token) import Data.List (foldl',inits,isPrefixOf,sortBy) import Data.Ord (comparing) import SparseVector (plus,scale) import Utils (splitOn) import Data.Char (toLower) import System.Environment import System.IO (stderr,hPutStr) import Control.Exception (assert) import Text.Printf import NLP.Scores (avgPrecision, mean) import Debug.Trace type Word = String type POS = String type ClustID = String type Feat = String type Count = Double type SemLex = Map.Map (Word,POS) (Map.Map Feat Count) type SemClust = Map.Map ClustID [(Feat,Count)] parseEntry :: String -> ((Word,POS),Map.Map Feat Count) parseEntry ln = case words ln of (wp:fs) -> let [w,p] = splitOn ':' wp in ((w,map toLower p),Map.fromList . map (\f -> (f,1)) . splitOn ',' . unwords $ fs) parseLexicon :: String -> SemLex parseLexicon = foldl' f Map.empty . map parseEntry . filter (not . null) . lines where f z (k,v) = Map.insertWith' (Map.unionWith (+)) (v == v `seq` k) v z semClusters :: SemLex -> [((Word,ClustID,POS),Count)] -> SemClust semClusters dict = Map.map (sortBy (flip $ comparing snd) . Map.toList) . Map.fromListWith (plus) . map (\((w,cid,p),c) -> (cid,Map.findWithDefault Map.empty (w,p) dict `scale` c)) evalSem args = do let [details -- be verbose ,lexf -- lexicon file ,trainposf -- POS tagged train file ,trainf -- Cluster labeled train file ,posf -- POS tagged test file ,clustf -- Cluster labeled test file ] = args lex <- fmap parseLexicon $ readFile lexf css <- fmap readcorpus $ readFile trainf cpos <- fmap readcorpus $ readFile trainposf pss <- fmap readcorpus $ readFile posf xss <- fmap readcorpus $ readFile clustf let toks yss zss = Map.toList . Map.fromListWith (+) . map (\k -> (k,1)) . zipWith (\(w,p) (w',cid) -> assert (w == w') (w,cid,p)) (concat yss) . concat $ zss its = filter (\((w,_,p),_) -> (take 1 p `elem` ["n","v"] && Map.member (w,p) lex)) . toks pss $ xss cs = semClusters lex . toks cpos $ css ap ((w,cid,p),c) | read details && trace (show $ Map.findWithDefault [] cid $ cs) False = undefined ap ((w,cid,p),c) = c * (avgPrecision (Map.keysSet . Map.findWithDefault Map.empty (w,p) $ lex) . map fst . Map.findWithDefault [] cid $ cs) aps = map ap $ its :: [Double] hPutStr stderr . unlines . map (\(t,a) -> printf "%-40s %2.3f" (show t) a) . zip its $ aps printf "%2.3f\n" . (/ sum (map snd its)) . sum $ aps