module NLP.Concraft.Polish.Morphosyntax
(
Tag
, Seg (..)
, Word (..)
, Interp (..)
, Space (..)
, select
, Sent
, SentO (..)
, restore
, withOrig
, packSeg
, packSent
, packSentO
) where
import Control.Applicative ((<$>), (<*>))
import Control.Arrow (first)
import Data.Maybe (catMaybes)
import Data.Aeson
import Data.Binary (Binary, put, get, putWord8, getWord8)
import qualified Data.Aeson as Aeson
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Tagset.Positional as P
import qualified NLP.Concraft.Morphosyntax as X
type Tag = T.Text
data Seg t = Seg
{ word :: Word
, interps :: M.Map (Interp t) Bool }
deriving (Show, Eq, Ord)
instance (Ord t, Binary t) => Binary (Seg t) where
put Seg{..} = put word >> put interps
get = Seg <$> get <*> get
data Word = Word
{ orth :: T.Text
, space :: Space
, known :: Bool }
deriving (Show, Eq, Ord)
instance X.Word Word where
orth = orth
oov = not.known
instance ToJSON Word where
toJSON Word{..} = object
[ "orth" .= orth
, "space" .= space
, "known" .= known ]
instance FromJSON Word where
parseJSON (Object v) = Word
<$> v .: "orth"
<*> v .: "space"
<*> v .: "known"
parseJSON _ = error "parseJSON [Word]"
instance Binary Word where
put Word{..} = put orth >> put space >> put known
get = Word <$> get <*> get <*> get
data Interp t = Interp
{ base :: Maybe T.Text
, tag :: t }
deriving (Show, Eq, Ord)
instance (Ord t, Binary t) => Binary (Interp t) where
put Interp{..} = put base >> put tag
get = Interp <$> get <*> get
data Space
= None
| Space
| NewLine
deriving (Show, Eq, Ord)
instance Binary Space where
put x = case x of
None -> putWord8 1
Space -> putWord8 2
NewLine -> putWord8 3
get = getWord8 >>= \x -> return $ case x of
1 -> None
2 -> Space
_ -> NewLine
instance ToJSON Space where
toJSON x = Aeson.String $ case x of
None -> "none"
Space -> "space"
NewLine -> "newline"
instance FromJSON Space where
parseJSON (Aeson.String x) = return $ case x of
"none" -> None
"space" -> Space
"newline" -> NewLine
_ -> error "parseJSON [Space]"
parseJSON _ = error "parseJSON [Space]"
select :: Ord a => a -> Seg a -> Seg a
select x = selectWMap (X.mkWMap [(x, 1)])
selectWMap :: Ord a => X.WMap a -> Seg a -> Seg a
selectWMap wMap seg =
seg { interps = newInterps }
where
wSet = M.fromList . map (first tag) . M.toList . interps
asDmb x = if x > 0
then True
else False
newInterps = M.fromList $
[ case M.lookup (tag interp) (X.unWMap wMap) of
Just x -> (interp, asDmb x)
Nothing -> (interp, False)
| interp <- M.keys (interps seg) ]
++ catMaybes
[ if tag `M.member` wSet seg
then Nothing
else Just (Interp Nothing tag, asDmb x)
| (tag, x) <- M.toList (X.unWMap wMap) ]
type Sent t = [Seg t]
data SentO t = SentO
{ segs :: [Seg t]
, orig :: L.Text }
restore :: Sent t -> L.Text
restore =
let wordStr Word{..} = [spaceStr space, orth]
spaceStr None = ""
spaceStr Space = " "
spaceStr NewLine = "\n"
in L.fromChunks . concatMap (wordStr . word)
withOrig :: Sent t -> SentO t
withOrig s = SentO
{ segs = s
, orig = restore s }
packSeg_ :: Ord a => Seg a -> X.Seg Word a
packSeg_ Seg{..} = X.Seg word $ X.mkWMap
[ (tag x, if disamb then 1 else 0)
| (x, disamb) <- M.toList interps ]
packSeg :: P.Tagset -> Seg Tag -> X.Seg Word P.Tag
packSeg tagset = X.mapSeg (P.parseTag tagset) . packSeg_
packSent :: P.Tagset -> Sent Tag -> X.Sent Word P.Tag
packSent = map . packSeg
packSentO :: P.Tagset -> SentO Tag -> X.SentO Word P.Tag
packSentO tagset s = X.SentO
{ segs = packSent tagset (segs s)
, orig = orig s }