module NLP.POS
( tag
, tagStr
, tagText
, train
, trainStr
, trainText
, tagTokens
, eval
, serialize
, deserialize
, taggerTable
, saveTagger
, loadTagger
, defaultTagger
, conllTagger
, brownTagger
)
where
import Codec.Compression.GZip (decompress)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.List (isSuffixOf)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Serialize (decode, encode)
import Data.Text (Text)
import qualified Data.Text as T
import System.FilePath ((</>))
import NLP.Corpora.Parsing (readPOS)
import NLP.Tokenize.Text (tokenize)
import NLP.Types ( POSTagger(..), Sentence, POS(..)
, combine, Tag (..), unTS, tsLength
, TaggedSentence(..), stripTags
, printTS)
import qualified NLP.POS.AvgPerceptronTagger as Avg
import qualified NLP.POS.LiteralTagger as LT
import qualified NLP.POS.UnambiguousTagger as UT
import qualified NLP.Corpora.Brown as B
import qualified NLP.Corpora.Conll as C
import Paths_chatter
defaultTagger :: IO (POSTagger C.Tag)
defaultTagger = conllTagger
conllTagger :: IO (POSTagger C.Tag)
conllTagger = do
dir <- getDataDir
loadTagger (dir </> "data" </> "models" </> "conll2000.pos.model.gz")
brownTagger :: IO (POSTagger B.Tag)
brownTagger = do
dir <- getDataDir
loadTagger (dir </> "data" </> "models" </> "brown.pos.model.gz")
taggerTable :: Tag t => Map ByteString
(ByteString -> Maybe (POSTagger t) -> Either String (POSTagger t))
taggerTable = Map.fromList
[ (LT.taggerID, LT.readTagger)
, (Avg.taggerID, Avg.readTagger)
, (UT.taggerID, UT.readTagger)
]
saveTagger :: Tag t => POSTagger t -> FilePath -> IO ()
saveTagger tagger file = BS.writeFile file (serialize tagger)
loadTagger :: Tag t => FilePath -> IO (POSTagger t)
loadTagger file = do
content <- getContent file
case deserialize taggerTable content of
Left err -> error err
Right tgr -> return tgr
where
getContent :: FilePath -> IO ByteString
getContent f | ".gz" `isSuffixOf` file = fmap (LBS.toStrict . decompress) $ LBS.readFile f
| otherwise = BS.readFile f
serialize :: Tag t => POSTagger t -> ByteString
serialize tagger =
let backoff = case posBackoff tagger of
Nothing -> Nothing
Just btgr -> Just $ serialize btgr
in encode ( posID tagger
, posSerialize tagger
, backoff
)
deserialize :: Tag t =>
Map ByteString
(ByteString -> Maybe (POSTagger t) -> Either String (POSTagger t))
-> ByteString
-> Either String (POSTagger t)
deserialize table bs = do
(theID, theTgr, mBackoff) <- decode bs
backoff <- case mBackoff of
Nothing -> Right Nothing
Just str -> Just `fmap` (deserialize table str)
case Map.lookup theID table of
Nothing -> Left ("Could not find ID in POSTagger function map: " ++ show theID)
Just fn -> fn theTgr backoff
tag :: Tag t => POSTagger t -> Text -> [TaggedSentence t]
tag p txt = let sentences = (posSplitter p) txt
tokens = map (posTokenizer p) sentences
in tagTokens p tokens
tagTokens :: Tag t => POSTagger t -> [Sentence] -> [TaggedSentence t]
tagTokens p tokens = let priority = (posTagger p) tokens
in case posBackoff p of
Nothing -> priority
Just tgr -> combine priority (tagTokens tgr tokens)
tagStr :: Tag t => POSTagger t -> String -> String
tagStr tgr = T.unpack . tagText tgr . T.pack
tagText :: Tag t => POSTagger t -> Text -> Text
tagText tgr txt = T.intercalate " " $ map printTS $ tag tgr txt
trainStr :: Tag t => POSTagger t -> String -> IO (POSTagger t)
trainStr tgr = trainText tgr . T.pack
trainText :: Tag t => POSTagger t -> Text -> IO (POSTagger t)
trainText p exs = train p (map readPOS $ tokenize exs)
train :: Tag t => POSTagger t -> [TaggedSentence t] -> IO (POSTagger t)
train p exs = do
let
trainBackoff = case posBackoff p of
Nothing -> return $ Nothing
Just b -> do tgr <- train b exs
return $ Just tgr
trainer = posTrainer p
newTgr <- trainer exs
newBackoff <- trainBackoff
return (newTgr { posBackoff = newBackoff })
eval :: Tag t => POSTagger t -> [TaggedSentence t] -> Double
eval tgr oracle = let
sentences = map stripTags oracle
results = (posTagger tgr) sentences
totalTokens = fromIntegral $ sum $ map tsLength oracle
isMatch :: Tag t => POS t -> POS t -> Double
isMatch (POS rTag _) (POS oTag _) | rTag == oTag = 1
| otherwise = 0
in (sum $ zipWith isMatch (concatMap unTS results) (concatMap unTS oracle)) / totalTokens