-- | Input helpers for using Concraft with Croatian data.
--
-- The module turns raw text into Concraft sentences ('packSent',
-- 'packSentT'), merges extra tag analyses into an existing sentence
-- ('addAnalysis'), splits a text into per-sentence line groups
-- ('extractSentences') and normalises tag strings before they are parsed
-- ('transformToConfig').  It also exports the 'Word' record used as the
-- word type of those sentences and the 'ListLike' class that abstracts
-- over strict and lazy text.
module NLP.Concraft.Croatian.Morphosyntax
(
-- * Building and extending sentences
packSent
, packSentT
, addAnalysis
-- * Text preprocessing
, extractSentences
, transformToConfig
-- * Types and classes
, Word(..)
, ListLike(..)
) where
import Prelude hiding (Word)

import Data.Aeson (FromJSON, ToJSON)
import Data.Binary (Binary(..))
import qualified Data.Map as M
import qualified Data.Set as S
import Data.String (IsString)
import qualified Data.Tagset.Positional as P
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import GHC.Generics
import qualified NLP.Concraft.Morphosyntax as X
import NLP.Concraft.Morphosyntax (Seg(..))
-- | A single word of a sentence, as stored in Concraft segments.
--
-- The type shares its name with @Prelude.Word@ (exported by the Prelude
-- since base 4.8); the module therefore imports the Prelude with that
-- name hidden so that unqualified uses of 'Word' refer to this record.
data Word = Word
  { orth :: T.Text
    -- ^ Textual form of the word.  'packSegP' fills it with the first
    -- whitespace-separated token of an input line.
  , oov :: Bool
    -- ^ Flag set by this module when no tags are available for the word:
    -- 'packSegP' sets it when the input line carries no tag tokens and
    -- 'addAnalysis' sets it when the supplied tag set is empty.
    -- (Presumably short for \"out of vocabulary\".)
  }
  deriving (Show, Eq, Ord, Generic)
-- | Makes the local 'Word' record usable wherever Concraft expects an
-- instance of its own @Word@ class.  Each class method simply returns the
-- record field of the same name.
instance X.Word Word where
  orth (Word form _) = form
  oov (Word _ unknown) = unknown
-- Serialisation instances for 'Word'.  None of them defines a method
-- explicitly, so each one relies entirely on the default method
-- implementations of its class (presumably the 'Generic'-based defaults,
-- which is why 'Word' derives 'Generic' -- confirm against the library
-- versions in use).
instance Binary Word
instance FromJSON Word
instance ToJSON Word
-- | Binary serialisation of a Concraft segment: the two constructor
-- fields are written, and read back, in constructor order.
--
-- NOTE(review): both the 'Binary' class and the 'Seg' type are imported
-- from other modules, so this is an orphan instance.
instance (Binary a, Binary b) => Binary (Seg a b) where
  put (Seg w ts) = do
    put w
    put ts
  get =
    get >>= \w ->
    get >>= \ts ->
    return (Seg w ts)
-- | The small set of text operations this module needs, abstracted so the
-- same functions work on both strict and lazy text (the two instances
-- defined in this module).  The 'IsString' superclass allows functions
-- that are polymorphic in the text type to build values from string
-- literals.
class IsString a => ListLike a where
  -- | Put the given character between every two adjacent characters.
  tcintersperse :: Char -> a -> a

  -- | Apply a function to every character.
  tcmap :: (Char -> Char) -> a -> a

  -- | Convert the value to strict 'T.Text'.
  strict :: a -> T.Text

  -- | Split into whitespace-separated words.
  tcwords :: a -> [a]

  -- | @tcsplitOn sep txt@ breaks @txt@ at every occurrence of @sep@.
  tcsplitOn :: a -> a -> [a]

  -- | Test whether the text is empty.
  tcnull :: a -> Bool

  -- | Split into lines.
  tclines :: a -> [a]
-- | Strict text: every method is the "Data.Text" function of the
-- corresponding name, and 'strict' has nothing to convert.
instance ListLike T.Text where
  tcintersperse = T.intersperse
  tcmap         = T.map
  strict        = id
  tcwords       = T.words
  tcsplitOn     = T.splitOn
  tcnull        = T.null
  tclines       = T.lines
-- | Lazy text: every method is the "Data.Text.Lazy" function of the
-- corresponding name; 'strict' converts to strict text with 'L.toStrict'.
instance ListLike L.Text where
  tcintersperse = L.intersperse
  tcmap         = L.map
  strict        = L.toStrict
  tcwords       = L.words
  tcsplitOn     = L.splitOn
  tcnull        = L.null
  tclines       = L.lines
-- | Rewrite a tag string into the form that 'packSegP' hands to the
-- tagset parser.
--
-- Every @\'=\'@ and @\'-\'@ is replaced by @\'9\'@, and afterwards a
-- @\':\'@ is inserted between every two adjacent characters, e.g.
-- @\"Nc-sn\"@ becomes @\"N:c:9:s:n\"@.
transformToConfig :: ListLike a => a -> a
transformToConfig = tcintersperse ':' . tcmap toNine
  where
    -- The characters are compared directly rather than with
    -- @elem x \"=-\"@: with string literals overloaded and 'elem'
    -- generalised to any 'Foldable', the container type of that literal
    -- is ambiguous and the expression is rejected by the type checker.
    toNine c
      | c == '=' || c == '-' = '9'
      | otherwise            = c
-- | Merge additional tag analyses into a sentence.
--
-- Segments and tag sets are paired positionally with 'zipWith', so the
-- result is as long as the shorter of the two inputs.  For every pair:
--
-- * each tag of the set is added to the segment's tag map with value 0;
--   a tag that is already present keeps its existing value;
--
-- * the word's 'oov' flag is overwritten: it becomes 'True' exactly when
--   the supplied tag set is empty.
addAnalysis :: X.Sent Word P.Tag -> [S.Set P.Tag] -> X.Sent Word P.Tag
addAnalysis sent analyses = zipWith merge sent analyses
  where
    merge seg tgs =
      seg
        { word = (word seg) { oov = S.null tgs }
        , tags = X.mkWMap (M.toList merged)
        }
      where
        -- Tags coming from the analysis, all with value 0.
        fresh = M.fromSet (const 0) tgs
        -- Tags the segment already carries.
        existing = X.unWMap (tags seg)
        -- Left-biased union: values already on the segment win.
        merged = M.union existing fresh
-- | Pack input lines (one word followed by its tags per line) into a
-- sentence, pairing every tag with the value 0.
packSent :: ListLike a => P.Tagset -> [a] -> X.Sent Word P.Tag
packSent = packSentP (repeat 0.0)
-- | Like 'packSent', but the first tag listed for each word is paired
-- with the value 1 and all following tags with 0.
packSentT :: ListLike a => P.Tagset -> [a] -> X.Sent Word P.Tag
packSentT = packSentP (1.0 : repeat 0.0)
-- | Pack input lines into a sentence, one segment per line.  The same
-- list of values @dist@ is used for every line; 'packSegP' pairs it
-- positionally with that line's tags.
packSentP :: ListLike a => [Double] -> P.Tagset -> [a] -> X.Sent Word P.Tag
packSentP dist tset inputLines =
  [ packSegP tset dist inputLine | inputLine <- inputLines ]
-- | Build one segment from one input line.
--
-- The line is split on whitespace.  The first token becomes the word
-- form; every further token is a tag string that is normalised with
-- 'transformToConfig' and parsed against the tagset.  The parsed tags are
-- paired positionally with the values in @dist@ ('zip' stops at the
-- shorter list).  A word without any tag token gets 'oov' set to 'True'.
--
-- A line with no token at all cannot form a segment.  Using the result
-- for such a line raises an error that names this function, instead of
-- the anonymous irrefutable-pattern failure a partial @(w:tagxs)@ binding
-- would produce.
packSegP :: ListLike a => P.Tagset -> [Double] -> a -> X.Seg Word P.Tag
packSegP tset dist xs = X.Seg { word = mywrd, tags = wmap }
  where
    -- Lazy tuple binding: the line is only inspected once a field of the
    -- segment is actually demanded, exactly as before.
    (w, tagxs) = case tcwords xs of
      (form : rest) -> (form, rest)
      [] -> error
        "NLP.Concraft.Croatian.Morphosyntax.packSegP: empty line, expected a word followed by its tags"
    wmap = X.mkWMap (zip rtags dist)
    rtags = map (P.parseTag tset . strict . transformToConfig) tagxs
    mywrd = Word { orth = strict w, oov = null rtags }
-- | Split a text into sentences, each given as the list of its lines.
--
-- Sentences are separated by a blank line (two consecutive newlines).
-- Runs of three or more newlines leave stray newlines at the edge of a
-- chunk, which would otherwise show up as empty lines inside a sentence,
-- or as a sentence made of a single empty line.  Such empty lines carry
-- no word and make 'packSegP' fail, so they are removed, and sentences
-- left without any line are dropped.
extractSentences :: ListLike a => a -> [[a]]
extractSentences =
  filter (not . null) . map nonEmptyLines . tcsplitOn "\n\n"
  where
    nonEmptyLines = filter (not . tcnull) . tclines