{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE OverloadedStrings #-} -- | Observation schema blocks for Nerf. module NLP.Nerf.Schema ( -- * Schema Ox , Schema , void , sequenceS_ -- * Using the schema , schematize -- * Building schema -- ** From config , SchemaCfg (..) , defaultCfg , fromCfg -- ** Schema blocks , Block , fromBlock , orthS , lemmaS , shapeS , shapePairS , suffixS , searchS ) where import Control.Applicative ((<$>), (<*>)) import Control.Monad (forM_, join) import Data.Maybe (maybeToList) import Data.Binary (Binary, put, get, decodeFile) import qualified Data.Char as C import qualified Data.Set as S import qualified Data.Map as M import qualified Data.Vector as V import qualified Data.Text as T import qualified Data.CRF.Chain1 as CRF import qualified Control.Monad.Ox as Ox import qualified Control.Monad.Ox.Text as Ox import NLP.Nerf.Types import qualified NLP.Nerf.Dict as Dict -- | The Ox monad specialized to word token type and text observations. type Ox a = Ox.Ox Word T.Text a -- | A schema is a block of the Ox computation performed within the -- context of the sentence and the absolute sentence position. type Schema a = V.Vector Word -> Int -> Ox a -- | A dummy schema block. void :: a -> Schema a void x _ _ = return x -- | Sequence the list of schemas and discard individual values. sequenceS_ :: [Schema a] -> Schema () sequenceS_ xs sent = let ys = map ($sent) xs in \k -> sequence_ (map ($k) ys) -- | Record structure of the basic observation types. data BaseOb = BaseOb { orth :: Int -> Maybe T.Text , lowOrth :: Int -> Maybe T.Text } -- | Construct the 'BaseOb' structure given the sentence. mkBaseOb :: V.Vector Word -> BaseOb mkBaseOb sent = BaseOb { orth = _orth , lowOrth = _lowOrth } where at = Ox.atWith sent _orth = (id `at`) _lowOrth i = T.toLower <$> _orth i -- | A block is a chunk of the Ox computation performed within the -- context of the sentence and the list of absolute sentence positions. type Block a = V.Vector Word -> [Int] -> Ox a -- | Transform the block to the schema dependent on the list of -- relative sentence positions. fromBlock :: Block a -> [Int] -> Schema a fromBlock blk xs sent = let blkSent = blk sent in \k -> blkSent [x + k | x <- xs] -- | Orthographic observations determined with respect to the -- list of relative positions. orthS :: Block () orthS sent = \ks -> do mapM_ (Ox.save . lowOrth) ks mapM_ (Ox.save . upOnlyOrth) ks where BaseOb{..} = mkBaseOb sent upOnlyOrth i = orth i >>= \x -> case T.any C.isUpper x of True -> Just x False -> Nothing -- | Lemma substitute determined with respect to the list of -- relative positions. lemmaS :: Block () lemmaS sent = \ks -> do mapM_ lowLemma ks where BaseOb{..} = mkBaseOb sent lowPrefix i j = Ox.prefix j =<< lowOrth i lowSuffix i j = Ox.suffix j =<< lowOrth i lowLemma i = Ox.group $ do mapM_ (Ox.save . lowPrefix i) [0, -1, -2, -3] mapM_ (Ox.save . lowSuffix i) [0, -1, -2, -3] -- | Shape and packed shape determined with respect to the list of -- relative positions. shapeS :: Block () shapeS sent = \ks -> do mapM_ (Ox.save . shape) ks mapM_ (Ox.save . shapeP) ks where BaseOb{..} = mkBaseOb sent shape i = Ox.shape <$> orth i shapeP i = Ox.pack <$> shape i -- | Shape pairs determined with respect to the list of relative positions. shapePairS :: Block () shapePairS sent = \ks -> forM_ ks $ \i -> do Ox.save $ link <$> shape i <*> shape (i - 1) Ox.save $ link <$> shapeP i <*> shapeP (i - 1) where BaseOb{..} = mkBaseOb sent shape i = Ox.shape <$> orth i shapeP i = Ox.pack <$> shape i link x y = T.concat [x, "-", y] -- | Several suffixes determined with respect to the list of -- relative positions. suffixS :: Block () suffixS sent = \ks -> forM_ ks $ \i -> mapM_ (Ox.save . lowSuffix i) [2, 3, 4] where BaseOb{..} = mkBaseOb sent lowSuffix i j = Ox.suffix j =<< lowOrth i -- | Plain dictionary search determined with respect to the list of -- relative positions. searchS :: Dict.NeDict -> Block () searchS dict sent = \ks -> do mapM_ (Ox.saves . searchDict) ks where BaseOb{..} = mkBaseOb sent searchDict i = join . maybeToList $ S.toList <$> (orth i >>= flip M.lookup dict) -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken on account. -- For example, the @[-1, 0, 2]@ range means that observations of particular -- type will be extracted with respect to previous (@k - 1@), current (@k@) -- and after the next (@k + 2@) positions when identifying the observation -- set for position @k@ in the input sentence. data SchemaCfg = SchemaCfg { orthC :: [Int] -- ^ The 'orthS' schema block , lemmaC :: [Int] -- ^ The 'lemmaS' schema block , shapeC :: [Int] -- ^ The 'shapeS' schema block , shapePairC :: [Int] -- ^ The 'shapePairS' schema block , suffixC :: [Int] -- ^ The 'suffixS' schema block , dictC :: Maybe (Dict.NeDict, [Int]) -- ^ The 'searchS' schema block } instance Binary SchemaCfg where put SchemaCfg{..} = do put orthC put lemmaC put shapeC put shapePairC put suffixC put dictC get = SchemaCfg <$> get <*> get <*> get <*> get <*> get <*> get -- | Default configuration for Nerf observation schema. defaultCfg :: FilePath -- ^ Path to 'Dict.NeDict' in a binary form -> IO SchemaCfg defaultCfg nePath = do neDict <- decodeFile nePath return $ SchemaCfg { orthC = [-1, 0] , lemmaC = [-1, 0] , shapeC = [-1, 0] , shapePairC = [0] , suffixC = [0] , dictC = Just (neDict, [-1, 0]) } mkBasicS :: Block () -> [Int] -> Schema () mkBasicS _ [] = void () mkBasicS blk xs = fromBlock blk xs mkDictS :: Maybe (Dict.NeDict, [Int]) -> Schema () mkDictS (Just (d, xs)) = fromBlock (searchS d) xs mkDictS Nothing = void () -- | Build the schema based on the configuration. fromCfg :: SchemaCfg -> Schema () fromCfg SchemaCfg{..} = sequenceS_ [ mkBasicS orthS orthC , mkBasicS lemmaS lemmaC , mkBasicS shapeS shapeC , mkBasicS shapePairS shapePairC , mkBasicS suffixS suffixC , mkDictS dictC ] -- | Use the schema to extract observations from the sentence. schematize :: Schema a -> [Word] -> CRF.Sent Ob schematize schema xs = map (S.fromList . Ox.execOx . schema v) [0 .. n - 1] where v = V.fromList xs n = V.length v