src/NLP/Types/IOB.hs

{-# LANGUAGE OverloadedStrings #-}
module NLP.Types.IOB where

import Prelude hiding (print)
import Control.Applicative ((<$>), (<*>))
import Data.Maybe (mapMaybe)
import Data.Monoid ((<>))
import Data.Text (Text)
import qualified Data.Text as T

import Test.QuickCheck (Arbitrary(..), elements)
import Test.QuickCheck.Instances ()

import NLP.Types.Tags
import NLP.Types.Tree
import NLP.Types.General (Error)


-- TODO: This module needs to be rewritten to parse IOB represented
-- data as a tree, then once that tree is created, establish the
-- semantic types at the proper levels.
--
-- I think the levels should look something like this:
--
--  0: Tokens
--  1: POS Tags
--  2/3: Chunks
--  3/2: NER tags

-- | Data type to indicate IOB tags for chunking
data IOBChunk chunk tag = BChunk (POS tag) chunk -- ^ Beging marker.
                        | IChunk (POS tag) chunk -- ^ In chunk tag
                        | OChunk (POS tag) -- ^ Not in a chunk.
  deriving (Read, Show, Eq)

getPOS :: (ChunkTag c, Tag t) => IOBChunk c t -> POS t
getPOS (BChunk pos _) = pos
getPOS (IChunk pos _) = pos
getPOS (OChunk pos)   = pos

instance (ChunkTag c, Arbitrary c, Arbitrary t, Tag t) => Arbitrary (IOBChunk c t) where
  arbitrary = elements =<< do
                ic <- IChunk <$> arbitrary <*> arbitrary
                bc <- BChunk <$> arbitrary <*> arbitrary
                oc <- OChunk <$> arbitrary
                return [ic, bc, oc]

toTaggedSentence :: (ChunkTag c, Tag t) => [IOBChunk c t] -> TaggedSentence t
toTaggedSentence iobChunks = TaggedSent $ map getPOS iobChunks


-- | Parse an IOB-chunk encoded line of text.
--
-- Assumes that the line has three space-delimeted entries, in the format:
-- > token POSTag IOBChunk
-- For example:
-- > > parseIOBLine "We PRP B-NP" :: IOBChunk B.Chunk B.Tag
-- > BChunk (POS B.PRP (Token "We")) B.C_NP
--
parseIOBLine :: (ChunkTag chunk, Tag tag) => Text -> Either Error (IOBChunk chunk tag)
parseIOBLine txt =
  case T.words txt of
    (tokTxt:tagTxt:iobTxt:_) ->
      let token = Token tokTxt
          tag   = POS (parseTag tagTxt) token
      in iobBuilder iobTxt tag
    _ -> Left ("not enough words in IOB line: \"" <> txt <> "\"")

iobBuilder :: (ChunkTag c, Tag t) => Text -> (POS t -> Either Error (IOBChunk c t))
iobBuilder iobTxt | "I-" `T.isPrefixOf` iobTxt = \tag -> (IChunk tag) <$> chunk
                  | "B-" `T.isPrefixOf` iobTxt = \tag -> (BChunk tag) <$> chunk
                  | otherwise                  = \tag -> Right (OChunk tag)
  where
    chunk = parseChunk (T.drop 2 iobTxt)


-- | Turn an IOB result into a tree.
toChunkTree :: (ChunkTag c, Tag t) => [IOBChunk c t] -> ChunkedSentence c t
toChunkTree chunks = ChunkedSent $ toChunkOr chunks

  where
    toChunkOr :: (ChunkTag c, Tag t) => [IOBChunk c t] -> [ChunkOr c t]
    toChunkOr [] = []
    toChunkOr ((OChunk pos):rest)       = POS_CN pos : toChunkOr rest
    toChunkOr (ch:rest) = case ch of
      (BChunk pos chunk) -> (Chunk_CN (Chunk chunk children)) : toChunkOr theTail
      (IChunk pos chunk) -> (Chunk_CN (Chunk chunk children)) : toChunkOr theTail
      where
        (ichunks, theTail) = span isIChunk rest

        toPOScn (IChunk pos _) = Just $ POS_CN pos
        toPOScn _              = Nothing

    --    children :: [ChunkOr c t]
        children = mapMaybe toPOScn ichunks

        isIChunk (IChunk _ _) = True
        isIChunk _            = False

-- | Parse an IOB-encoded corpus.
parseIOB :: (ChunkTag chunk, Tag tag) => Text -> Either Error [[IOBChunk chunk tag]]
parseIOB corpora =
  let sentences = getSentences corpora
  in sequence $ map parseSentence sentences

parseSentence :: (ChunkTag chunk, Tag tag) => [Text] -> Either Error [IOBChunk chunk tag]
parseSentence input = sequence (map parseIOBLine input)

-- | Just split a body of text into lines, and then into "paragraphs".
-- Each resulting sub list is separated by empty lines in the original text.
--
-- e.g.;
-- > > getSentences "He\njumped\n.\n\nShe\njumped\n."
-- > [["He", "jumped", "."], ["She","jumped", "."]]
--
getSentences :: Text -> [[Text]]
getSentences corpora =
  let theLines = map T.strip $ T.lines corpora

      sentences :: [Text] -> [[Text]]
      sentences []      = []
      sentences ("":xs) = sentences xs
      sentences input   = let (sent, rest) = break (== T.empty) input
                          in (sent:sentences rest)

  in sentences theLines