src/NLP/Corpora/WikiNer.hs

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
-- | A parser for the Wiki NER work presented in:
--
-- @Article{nothman2012:artint:wikiner,
--   author = {Joel Nothman and Nicky Ringland and Will Radford and Tara Murphy and James R. Curran},
--   title = {Learning multilingual named entity recognition from {Wikipedia}},
--   journal = {Artificial Intelligence},
--   publisher = {Elsevier},
--   volume = {194},
--   pages = {151--175},
--   year = {2012},
--   doi = {10.1016/j.artint.2012.03.006},
--   url = {http://dx.doi.org/10.1016/j.artint.2012.03.006}
-- }
--
-- And provided here: http://schwa.org/projects/resources/wiki/Wikiner
--
-- The format does not appear to be documented, but it looks like:
--
--  * One sentence per line.
--
--  * Tagged tokens are separated by spaces
--
--  * Items in a tagged token are separated by vertical bars ('|')
--
--  * Each line of `n` text tokens contains 3*n items: every tagged
--  token is a text token, then a POS tag, then an IOB tag with one of
--  the NER classes
--
-- For example, the sentence:
--   The Oxford Companion to Philosophy says, "there is no single defining position that all anarchists hold, and those considered anarchists at best share a certain family resemblance."
--
-- Is rendered as:
--  The|DT|I-MISC Oxford|NNP|I-MISC Companion|NNP|I-MISC to|TO|I-MISC Philosophy|NNP|I-MISC says|VBZ|O ,|,|O "|LQU|O there|EX|O is|VBZ|O no|DT|O single|JJ|O defining|VBG|O position|NN|O that|IN|O all|DT|O anarchists|NNS|O hold|VBP|O ,|,|O and|CC|O those|DT|O considered|VBN|O anarchists|NNS|O at|IN|O best|JJS|O share|NN|O a|DT|O certain|JJ|O family|NN|O resemblance|NN|O .|.|O "|RQU|O
--
--
--  This module also provides a trained model for NER via the averaged
--  perceptron chunker.  This actually kind of works, which is a bit
--  amazing.  For example:
--
-- > import NLP.Corpora.WikiNer
-- > import NLP.POS
-- > import NLP.Chunk
-- > tgr <- defaultTagger
-- > chk <- wikiNerChunker
-- > chunkText tgr chk "Real World Haskell is a book created by Don Stewart, Bryan O'Sullivan, and Jon Goerzen."
-- > "[ORG Real/NNP] [MISC World/NNP] [PER Haskell/NNP] is/VBZ a/DT book/NN created/VBN by/IN [PER Don/NNP Stewart/NNP] ,/, [PER Bryan/NNP O'Sullivan/NNP] ,/, and/CC [PER Jon/NNP Goerzen/NNP] ./."
--
--
module NLP.Corpora.WikiNer
  ( parseWikiNer
  , trainChunker
  , wikiNerChunker
  , Chunk(..)
  )
where

import           Data.Text                      (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import           Data.Serialize                 (Serialize)
import           GHC.Generics
import           System.FilePath                ((</>))
import           Text.Read                      (readEither)
import           Test.QuickCheck.Arbitrary      (Arbitrary(..))
import           Test.QuickCheck.Gen            (elements)


import           NLP.Chunk                      (train, loadChunker)
import           NLP.Chunk.AvgPerceptronChunker (Chunker(..), mkChunker)
import qualified NLP.Corpora.Conll as Conll
import           NLP.ML.AvgPerceptron           ( emptyPerceptron )
import           NLP.Types.IOB hiding           (parseIOB)
import           NLP.Types.General              (Error, toEitherErr)
import           NLP.Types.Tags

import           Paths_chatter

-- | Parse WikiNER-formatted text into sentences of IOB chunks, using
-- 'Chunk' for the named-entity classes and 'Conll.Tag' for the POS
-- tags.  This is 'parseIOB' fixed to those two tag types; a parse
-- failure is reported as 'Left'.
parseWikiNer :: Text -> Either Error [[IOBChunk Chunk Conll.Tag]]
parseWikiNer input = parseIOB input

-- | Parse WikiNER-formatted text, one sentence per input line.
--
-- Each non-blank line is split into IOB-style lines with 'toIOBLines'
-- and handed to 'parseSentence'.  Lines that are empty or hold only
-- whitespace (for example the lone carriage return left behind by a
-- blank line in CRLF input) are skipped rather than being handed to
-- the parser with no tokens.  The first sentence that fails to parse
-- yields 'Left'; otherwise the parsed sentences are returned in input
-- order.
parseIOB :: (ChunkTag chunk, Tag tag) => Text -> Either Error [[IOBChunk chunk tag]]
parseIOB input = mapM (parseSentence . toIOBLines) sentenceLines
  where
    -- Blank separator lines carry no tokens, so drop them up front.
    sentenceLines = filter (not . T.null . T.strip) (T.lines input)

-- | The named-entity classes that occur in the WikiNER data set.
--
-- Constructor order is significant: the derived 'Ord', 'Enum' and
-- 'Bounded' instances all follow it, and the derived 'Show' / 'Read'
-- instances use the constructor names verbatim.
data Chunk
  = LOC
  | MISC
  | ORG
  | PER
  | C_O -- ^ "out" not a chunk.
  deriving (Read, Show, Ord, Eq, Generic, Enum, Bounded)


-- | Generate any 'Chunk' constructor, drawn with 'elements' from the
-- full 'minBound' .. 'maxBound' range.
instance Arbitrary Chunk where
  arbitrary = elements [minBound .. maxBound]

-- | Serialization support for 'Chunk'.  No methods are overridden, so
-- the class defaults apply (presumably the 'Generic'-based encoding,
-- given that 'Chunk' derives 'Generic' -- confirm against the
-- @Data.Serialize@ documentation).
instance Serialize Chunk

-- | 'Chunk' as a chunk tag: a tag is rendered as its constructor name
-- (via 'show'), read back from a constructor name (via 'readEither',
-- with the result converted by 'toEitherErr'), and 'C_O' is the
-- "not a chunk" value.
instance ChunkTag Chunk where
  fromChunk chunk = T.pack (show chunk)
  parseChunk = toEitherErr . readEither . T.unpack
  notChunk = C_O

-- | Load the WikiNER chunker model that lives under the package data
-- directory, at @data/models/wikiner.ner.model.gz@.
wikiNerChunker :: IO (Chunker Chunk Conll.Tag)
wikiNerChunker = do
  dataDir <- getDataDir
  let modelPath = dataDir </> "data" </> "models" </> "wikiner.ner.model.gz"
  loadChunker modelPath

-- | Translate one WikiNER sentence into a list of IOB-style lines.
--
-- The sentence is split on whitespace into tagged tokens, and within
-- each token every vertical bar (@|@) is replaced by a single space,
-- so @The|DT|I-MISC@ becomes @The DT I-MISC@.
toIOBLines :: Text -> [Text]
toIOBLines sent = [T.replace "|" " " taggedToken | taggedToken <- T.words sent]

-- | Train a chunker on the given WikiNER corpus files.
--
-- Every file is read, the contents are joined with newlines, and the
-- result is parsed with 'parseWikiNer'.  On a parse failure the error
-- text is printed and then raised with 'error'.  On success the first
-- parsed sentence is printed, each sentence is converted with
-- 'toChunkTree', and a chunker built from 'emptyPerceptron' is trained
-- on the converted sentences.
trainChunker :: [FilePath] -> IO (Chunker Chunk Conll.Tag)
trainChunker corpora = do
  fileTexts <- mapM T.readFile corpora
  case parseWikiNer (T.intercalate "\n" fileTexts) of
    Left err -> T.putStrLn err >> error (T.unpack err)
    Right sentences -> do
      print (take 1 sentences)
      train untrained (map toChunkTree sentences)
  where
    -- The starting point for training: a chunker with no learned weights.
    untrained :: Chunker Chunk Conll.Tag
    untrained = mkChunker emptyPerceptron