module NLP.Chunk.AvgPerceptronChunker
( mkChunker
, trainInt
, chunk
, chunkSentence
, Chunker(..)
, chunkerID
, readChunker
)
where
import NLP.ML.AvgPerceptron ( Perceptron, Feature(..)
, Class(..), predict, update
, averageWeights)
import NLP.Types
import Data.ByteString (ByteString)
import Data.List (foldl', group)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Either (rights)
import Data.Maybe (fromMaybe)
import Data.Serialize (encode, decode)
import Data.Text (Text)
import qualified Data.Text as T
import System.Random.Shuffle (shuffleM)
-- | A chunker: the closure of a trained model together with the
-- operations that use it.  @c@ is the chunk-tag type, @t@ the POS-tag
-- type.
data Chunker c t = Chunker
  { chChunker :: [TaggedSentence t] -> [ChunkedSentence c t]
    -- ^ Chunk a batch of POS-tagged sentences.
  , chTrainer :: [ChunkedSentence c t] -> IO (Chunker c t)
    -- ^ Train on chunked examples, producing an updated chunker.
  , chSerialize :: ByteString
    -- ^ The serialized model ('encode'd perceptron weights).
  , chId :: ByteString
    -- ^ Implementation identifier (see 'chunkerID'); presumably used
    -- to select the right reader on deserialization — confirm against
    -- the loading code.
  }
-- | Identifier for this chunker implementation, stored in 'chId' so a
-- serialized model can be matched back to this module.
chunkerID :: ByteString
chunkerID = "NLP.Chunk.AvgPerceptronChunker"
-- | Deserialize a chunker from a 'ByteString' produced by
-- 'chSerialize'.  Returns @Left@ with the decoder's error message when
-- the bytes do not contain a valid perceptron model.
readChunker :: (ChunkTag c, Tag t) => ByteString -> Either String (Chunker c t)
readChunker bs = mkChunker <$> decode bs
-- | Number of passes over the training corpus made by 'chTrainer'.
-- NOTE(review): the name is a misspelling of "iterations"; it is used
-- elsewhere in this module, so renaming is left for a coordinated
-- change.
itterations :: Int
itterations = 5
-- | Wrap a perceptron model into the 'Chunker' interface.  Training
-- runs 'itterations' passes and wraps the resulting model again, so
-- repeated training composes.
mkChunker :: (ChunkTag c, Tag t) => Perceptron -> Chunker c t
mkChunker per = Chunker
  { chChunker   = chunk per
  , chTrainer   = retrain
  , chSerialize = encode per
  , chId        = chunkerID
  }
  where
    -- Train on the examples and rebuild the full Chunker closure
    -- around the improved model.
    retrain exs = mkChunker <$> trainInt itterations per exs
-- | Chunk every sentence in a corpus with the given model.
chunk :: (ChunkTag c, Tag t) => Perceptron -> [TaggedSentence t] -> [ChunkedSentence c t]
chunk per = map (chunkSentence per)
-- | Chunk a single POS-tagged sentence.
--
-- 'chunks' and 'features' are *mutually recursive* lazy bindings: the
-- feature map for word i reads the class predicted for word i-1 (the
-- list is seeded with the @"-START-"@ class), and each prediction in
-- turn consumes the feature map for its own position.  Laziness
-- resolves the knot one position at a time; do not reorder these
-- bindings or force either list eagerly.
chunkSentence :: (ChunkTag c, Tag t) => Perceptron -> TaggedSentence t -> ChunkedSentence c t
chunkSentence per (TaggedSent sent) = let
  chunks = [Class "-START-"] ++ map (predictChunk per) features

  features = zipWith3 (getFeatures sent)
                      [0..]
                      sent
                      chunks

  -- Drop the "-START-" seed, then parse the predicted class names into
  -- chunk tags.  Classes that fail to parse are silently discarded by
  -- the 'rights' below.
  chunkTags = map (\(Class c) -> parseChunk $ T.pack c) $ drop 1 chunks
  in toTree (rights $ chunkTags) sent
-- | Predict a chunk class from a feature map, defaulting to the
-- outside-any-chunk class @"O"@ when the model makes no prediction.
predictChunk :: Perceptron -> Map Feature Int -> Class
predictChunk model = fromMaybe (Class "O") . predict model
-- | Assemble a chunked sentence from a per-token chunk-tag sequence
-- and the matching POS tokens.  Consecutive equal chunk tags are
-- collapsed into a single chunk covering that run of tokens.
toTree :: (ChunkTag c, Tag t) => [c] -> [POS t] -> ChunkedSentence c t
toTree chunks tags =
  let -- One (tag, run-length) pair per maximal run of equal tags.
      -- 'group' never yields an empty list, so the pattern is total.
      runs = [ (c, length g) | g@(c:_) <- group chunks ]

      -- Walk the token list, carving off each run's tokens in order.
      carve [] _            = []
      carve ((c, n):rest) ts = (c, take n ts) : carve rest (drop n ts)
  in ChunkedSent $ concatMap toChunkOr (carve runs tags)
-- | Turn one (chunk tag, tokens) run into sentence elements: tokens
-- outside any chunk ('notChunk') stay bare, otherwise the run becomes
-- a single chunk node wrapping its tokens.
toChunkOr :: (ChunkTag c, Tag t) => (c, [POS t]) -> [ChunkOr c t]
toChunkOr (chnk, toks) =
  if chnk == notChunk
    then map POS_CN toks
    else [Chunk_CN (Chunk chnk (map POS_CN toks))]
-- | Train a perceptron on chunked example sentences.
trainInt :: (ChunkTag c, Tag t) =>
            Int                    -- ^ number of passes over the corpus
         -> Perceptron             -- ^ starting model
         -> [ChunkedSentence c t]  -- ^ training examples
         -> IO Perceptron
trainInt itr per examples =
  let classified = toClassLst (map unzipChunks examples)
  in trainCls itr per classified
-- | Convert chunk tags into perceptron 'Class' labels, leaving the
-- tagged sentences untouched.
toClassLst :: (ChunkTag c, Tag t) => [(TaggedSentence t, [c])] -> [(TaggedSentence t, [Class])]
toClassLst = map relabel
  where
    relabel (sentence, cs) = (sentence, map toClass cs)
    toClass = Class . T.unpack . fromChunk
-- | Run the training loop: repeat the corpus @itr@ times, shuffle the
-- combined list so passes see examples in random order, fold the
-- per-sentence update over it, and average the accumulated weights.
trainCls :: Tag t => Int -> Perceptron -> [(TaggedSentence t, [Class])] -> IO Perceptron
trainCls itr per examples = do
  -- 'replicate itr examples' is the idiomatic form of
  -- 'take itr (repeat examples)'.
  trainingSet <- shuffleM $ concat $ replicate itr examples
  return $ averageWeights $ foldl' trainSentence per trainingSet
-- | Sentinel token prepended to the sentence context so every real
-- word has a predecessor for the "prevpos" features.
startToks :: Tag t => [POS t]
startToks = [POS startTag (Token "-START-")]
-- | Sentinel token appended to the sentence context so every real
-- word has a successor for the "nextpos" features.
endToks :: Tag t => [POS t]
endToks = [POS endTag (Token "-END-")]
-- | One training pass over a single sentence: for each word, predict a
-- chunk class from its features and update the model toward the gold
-- class.
trainSentence :: Tag t => Perceptron -> (TaggedSentence t, [Class]) -> Perceptron
trainSentence per (TaggedSent sent, ts) = let
  -- Prepend the start marker so that the class passed to 'getFeatures'
  -- for word i is the class of word i-1 (the previous prediction
  -- context), not word i's own class.
  tags = [Class "-START-"] ++ ts

  features = zipWith3 (getFeatures sent)
                      [0..]
                      sent
                      tags

  fn :: Perceptron -> (Map Feature Int, Class) -> Perceptron
  fn model (feats, truth) = let
    guess = predictChunk model feats
    in update model truth guess $ Map.keys feats

  -- Zip with 'ts' (NOT the shifted 'tags'): the gold class for word i
  -- is ts !! i.
  in foldl' fn per (zip features ts)
-- | Build the perceptron feature map for one word position.
--
-- Fix: the previous-token lookups read @context!!(i1)@, referencing an
-- undefined name @i1@ (a mangled @i - 1@).  With @i = idx + 1@ and one
-- '-START-' sentinel prepended, @context !! (i - 1)@ is the token
-- before the current word, which is what the "prevpos" features need.
--
-- NOTE(review): the 'prev' (previous predicted class) argument is not
-- used by any feature here, but callers thread it through — confirm
-- whether a "prevclass" feature was intended.
getFeatures :: Tag t =>
               [POS t]  -- ^ the full tagged sentence
            -> Int      -- ^ index of the word within the sentence
            -> POS t    -- ^ the word being classified
            -> Class    -- ^ class predicted for the previous word (unused)
            -> Map Feature Int
getFeatures tagged idx word prev = let
  -- Sentence padded with sentinels, so prev/next lookups never fall
  -- off either end.
  context = startToks ++ tagged ++ endToks

  -- Index of 'word' inside 'context' (shifted by the one start token).
  i = idx + 1

  -- Register one occurrence of a feature built from the given parts.
  add :: Map Feature Int -> [Text] -> Map Feature Int
  add m args = Map.alter increment (mkFeature $ T.intercalate " " args) m

  increment :: Maybe Int -> Maybe Int
  increment Nothing  = Just 1
  increment (Just w) = Just (w + 1)

  features :: [[Text]]
  features = [ ["pos",  showPOStag word]
             , ["word", showPOStok word]
             , ["prevpos", showPOStag (context!!(i - 1))]
             , ["prevpos+pos", T.intercalate "+" $ map showPOStag
                                 [ context!!(i - 1), word ]
               ]
             , ["pos+nextpos", T.intercalate "+" $ map showPOStag
                                 [ word, context!!(i + 1) ]
               ]
             , ["tags-since-dt", tagsSinceDt $ take idx tagged]
             ]
  in foldl' add Map.empty features
-- | Render the POS tags seen since the most recent determiner, joined
-- with @-@.  The input is the sentence prefix before the current word;
-- reversing it lets the helper scan backwards from the word.
tagsSinceDt :: Tag t => [POS t] -> Text
tagsSinceDt = T.intercalate "-" . tagsSinceHelper . reverse
-- | Collect rendered POS tags up to (but excluding) the first
-- determiner.  Rewritten with 'takeWhile': the original hand-rolled the
-- recursion and built each cell as @[x] ++ rest@, the non-idiomatic
-- spelling of @x : rest@.
tagsSinceHelper :: Tag t => [POS t] -> [Text]
tagsSinceHelper posToks = map showPOStag (takeWhile notDt posToks)
  where
    notDt (POS t _) = not (isDt t)
-- | Wrap feature text in the 'Feat' constructor.  'T.copy' gives the
-- key its own buffer, so stored features do not pin the larger source
-- 'Text' they were sliced from.
mkFeature :: Text -> Feature
mkFeature = Feat . T.copy