module NLP.Types.IOB where
import Prelude hiding (print)
import Control.Applicative ((<$>), (<*>))
import Data.Maybe (mapMaybe)
import Data.Monoid ((<>))
import Data.Text (Text)
import qualified Data.Text as T
import Test.QuickCheck (Arbitrary(..), elements)
import Test.QuickCheck.Instances ()
import NLP.Types.Tags
import NLP.Types.Tree
import NLP.Types.General (Error)
data IOBChunk chunk tag = BChunk (POS tag) chunk
| IChunk (POS tag) chunk
| OChunk (POS tag)
deriving (Read, Show, Eq)
getPOS :: (ChunkTag c, Tag t) => IOBChunk c t -> POS t
getPOS (BChunk pos _) = pos
getPOS (IChunk pos _) = pos
getPOS (OChunk pos) = pos
instance (ChunkTag c, Arbitrary c, Arbitrary t, Tag t) => Arbitrary (IOBChunk c t) where
arbitrary = elements =<< do
ic <- IChunk <$> arbitrary <*> arbitrary
bc <- BChunk <$> arbitrary <*> arbitrary
oc <- OChunk <$> arbitrary
return [ic, bc, oc]
toTaggedSentence :: (ChunkTag c, Tag t) => [IOBChunk c t] -> TaggedSentence t
toTaggedSentence iobChunks = TaggedSent $ map getPOS iobChunks
parseIOBLine :: (ChunkTag chunk, Tag tag) => Text -> Either Error (IOBChunk chunk tag)
parseIOBLine txt =
case T.words txt of
(tokTxt:tagTxt:iobTxt:_) ->
let token = Token tokTxt
tag = POS (parseTag tagTxt) token
in iobBuilder iobTxt tag
_ -> Left ("not enough words in IOB line: \"" <> txt <> "\"")
iobBuilder :: (ChunkTag c, Tag t) => Text -> (POS t -> Either Error (IOBChunk c t))
iobBuilder iobTxt | "I-" `T.isPrefixOf` iobTxt = \tag -> (IChunk tag) <$> chunk
| "B-" `T.isPrefixOf` iobTxt = \tag -> (BChunk tag) <$> chunk
| otherwise = \tag -> Right (OChunk tag)
where
chunk = parseChunk (T.drop 2 iobTxt)
toChunkTree :: (ChunkTag c, Tag t) => [IOBChunk c t] -> ChunkedSentence c t
toChunkTree chunks = ChunkedSent $ toChunkOr chunks
where
toChunkOr :: (ChunkTag c, Tag t) => [IOBChunk c t] -> [ChunkOr c t]
toChunkOr [] = []
toChunkOr ((OChunk pos):rest) = POS_CN pos : toChunkOr rest
toChunkOr (ch:rest) = case ch of
(BChunk pos chunk) -> (Chunk_CN (Chunk chunk children)) : toChunkOr theTail
(IChunk pos chunk) -> (Chunk_CN (Chunk chunk children)) : toChunkOr theTail
where
(ichunks, theTail) = span isIChunk rest
toPOScn (IChunk pos _) = Just $ POS_CN pos
toPOScn _ = Nothing
children = mapMaybe toPOScn ichunks
isIChunk (IChunk _ _) = True
isIChunk _ = False
parseIOB :: (ChunkTag chunk, Tag tag) => Text -> Either Error [[IOBChunk chunk tag]]
parseIOB corpora =
let sentences = getSentences corpora
in sequence $ map parseSentence sentences
parseSentence :: (ChunkTag chunk, Tag tag) => [Text] -> Either Error [IOBChunk chunk tag]
parseSentence input = sequence (map parseIOBLine input)
getSentences :: Text -> [[Text]]
getSentences corpora =
let theLines = map T.strip $ T.lines corpora
sentences :: [Text] -> [[Text]]
sentences [] = []
sentences ("":xs) = sentences xs
sentences input = let (sent, rest) = break (== T.empty) input
in (sent:sentences rest)
in sentences theLines