module NLP.Concraft.Polish
(
C.Concraft
, C.saveModel
, C.loadModel
, tag
, tag'
, tagSent
, train
) where
import System.IO.Unsafe (unsafeInterleaveIO)
import Control.Applicative ((<$>))
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Set as S
import qualified Data.Tagset.Positional as P
import qualified Numeric.SGD as SGD
import qualified NLP.Concraft.Schema as S
import NLP.Concraft.Schema (SchemaConf(..), entry, entryWith)
import qualified NLP.Concraft.Guess as G
import qualified NLP.Concraft.Disamb as D
import qualified NLP.Concraft as C
import NLP.Concraft.Polish.Morphosyntax hiding (tag)
import NLP.Concraft.Polish.Maca
guessConfDefault :: SchemaConf
guessConfDefault = S.nullConf
{ lowPrefixesC = entryWith [1, 2] [0]
, lowSuffixesC = entryWith [1, 2] [0]
, knownC = entry [0]
, begPackedC = entry [0] }
disambConfDefault :: SchemaConf
disambConfDefault = S.nullConf
{ lowOrthC = entry [1, 0, 1]
, lowPrefixesC = oov $ entryWith [1, 2, 3] [0]
, lowSuffixesC = oov $ entryWith [1, 2, 3] [0]
, begPackedC = oov $ entry [0] }
where
oov (Just body) = Just $ body { S.oovOnly = True }
oov Nothing = Nothing
tiersDefault :: [D.Tier]
tiersDefault =
[tier1, tier2]
where
tier1 = D.Tier True $ S.fromList ["cas", "per"]
tier2 = D.Tier False $ S.fromList
[ "nmb", "gnd", "deg", "asp" , "ngt", "acm"
, "acn", "ppr", "agg", "vlc", "dot" ]
tag :: MacaPool -> C.Concraft -> T.Text -> IO [Sent Tag]
tag pool concraft inp = map (tagSent concraft) <$> macaPar pool inp
tag' :: MacaPool -> C.Concraft -> L.Text -> IO [[Sent Tag]]
tag' pool concraft
= lazyMapM (tag pool concraft . L.toStrict)
. map L.strip
. L.splitOn "\n\n"
tagSent :: C.Concraft -> Sent Tag -> Sent Tag
tagSent concraft inp =
let tagset = C.tagset concraft
packed = packSentTag tagset inp
xs = C.tag concraft packed
in embedSent inp $ map (P.showTag tagset) xs
lazyMapM :: (a -> IO b) -> [a] -> IO [b]
lazyMapM f (x:xs) = do
y <- f x
ys <- unsafeInterleaveIO $ lazyMapM f xs
return (y:ys)
lazyMapM _ [] = return []
train
:: SGD.SgdArgs
-> P.Tagset
-> Int
-> [SentO Tag]
-> Maybe [SentO Tag]
-> IO C.Concraft
train sgdArgs tagset guessNum train0 eval0 = do
pool <- newMacaPool 1
let guessConf = G.TrainConf guessConfDefault sgdArgs
disambConf = D.TrainConf tiersDefault disambConfDefault sgdArgs
ana = fmap (packSentTag tagset . concat) . macaPar pool . L.toStrict
C.train tagset ana guessNum guessConf disambConf
(map (packSentTagO tagset) train0)
(map (packSentTagO tagset) <$> eval0)