{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveGeneric #-} -- | A parser for the Wiki NER work presented in: -- -- @Article{nothman2012:artint:wikiner, -- author = {Joel Nothman and Nicky Ringland and Will Radford and Tara Murphy and James R. Curran}, -- title = {Learning multilingual named entity recognition from {Wikipedia}}, -- journal = {Artificial Intelligence}, -- publisher = {Elsevier}, -- volume = {194}, -- pages = {151--175}, -- year = {2012}, -- doi = {10.1016/j.artint.2012.03.006}, -- url = {http://dx.doi.org/10.1016/j.artint.2012.03.006} -- } -- -- And provided here: http://schwa.org/projects/resources/wiki/Wikiner -- -- The format does not appear to be documented, but it looks like: -- -- * One sentence per line. -- -- * Tagged tokens are separated by spaces -- -- * Items in a tagged token are separated by vertical bars ('|') -- -- * Each line of `n` text tokens contains 3*n items, starting with a -- text token, a POS tag, then a IOB tag with one of the NER classes -- -- For example, the sentence: -- The Oxford Companion to Philosophy says, "there is no single defining position that all anarchists hold, and those considered anarchists at best sharae a certain family resemblance." -- -- Is rendered as: -- The|DT|I-MISC Oxford|NNP|I-MISC Companion|NNP|I-MISC to|TO|I-MISC Philosophy|NNP|I-MISC says|VBZ|O ,|,|O "|LQU|O there|EX|O is|VBZ|O no|DT|O single|JJ|O defining|VBG|O position|NN|O that|IN|O all|DT|O anarchists|NNS|O hold|VBP|O ,|,|O and|CC|O those|DT|O considered|VBN|O anarchists|NNS|O at|IN|O best|JJS|O share|NN|O a|DT|O certain|JJ|O family|NN|O resemblance|NN|O .|.|O "|RQU|O -- -- -- This module also provides a trained model for NER via the averaged -- perceptron chunker. This actually kindof works, which is a bit -- amazing. For example: -- -- > import NLP.Corpora.WikiNer -- > import NLP.POS -- > import NLP.Chunk -- > tgr <- defaultTagger -- > chk <- wikiNerChunker -- > chunkText tgr chk "Real World Haskell is a book created by Don Stewart, Bryan O'Sullivan, and Jon Goerzen." -- > "[ORG Real/NNP] [MISC World/NNP] [PER Haskell/NNP] is/VBZ a/DT book/NN created/VBN by/IN [PER Don/NNP Stewart/NNP] ,/, [PER Bryan/NNP O'Sullivan/NNP] ,/, and/CC [PER Jon/NNP Goerzen/NNP] ./." -- -- module NLP.Corpora.WikiNer ( parseWikiNer , trainChunker , wikiNerChunker , Chunk(..) ) where import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.IO as T import Data.Serialize (Serialize) import GHC.Generics import System.FilePath ((</>)) import Text.Read (readEither) import Test.QuickCheck.Arbitrary (Arbitrary(..)) import Test.QuickCheck.Gen (elements) import NLP.Chunk (train, loadChunker) import NLP.Chunk.AvgPerceptronChunker (Chunker(..), mkChunker) import qualified NLP.Corpora.Conll as Conll import NLP.ML.AvgPerceptron ( emptyPerceptron ) import NLP.Types.IOB hiding (parseIOB) import NLP.Types.General (Error, toEitherErr) import NLP.Types.Tags import Paths_chatter parseWikiNer :: Text -> Either Error [[IOBChunk Chunk Conll.Tag]] parseWikiNer = parseIOB -- | Convert wikiNer format to basic IOB (one token perline, space -- separated tags, and a blank line between each sentence) parseIOB :: (ChunkTag chunk, Tag tag) => Text -> Either Error [[IOBChunk chunk tag]] parseIOB input = sequence $ map (parseSentence . toIOBLines) (filter (/="") $ T.lines input) -- | Different classes of Named Entity used in the WikiNER data set. data Chunk = LOC | MISC | ORG | PER | C_O -- ^ "out" not a chunk. deriving (Read, Show, Ord, Eq, Generic, Enum, Bounded) instance Arbitrary Chunk where arbitrary = elements [minBound ..] instance Serialize Chunk instance ChunkTag Chunk where fromChunk = T.pack . show parseChunk txt = toEitherErr $ readEither (T.unpack txt) notChunk = C_O wikiNerChunker :: IO (Chunker Chunk Conll.Tag) wikiNerChunker = do dir <- getDataDir loadChunker (dir </> "data" </> "models" </> "wikiner.ner.model.gz") -- | Tranlsate a WikiNER sentence into a list of IOB-lines, for -- parsing with `parseIOBLine` toIOBLines :: Text -> [Text] toIOBLines sent = map (T.replace "|" " ") (T.words sent) -- | Train a chunker on a provided corpus. trainChunker :: [FilePath] -> IO (Chunker Chunk Conll.Tag) trainChunker corpora = do content <- mapM T.readFile corpora let trainingText = T.intercalate "\n" content eiobs = parseWikiNer trainingText chunker :: Chunker Chunk Conll.Tag chunker = mkChunker emptyPerceptron case eiobs of Left err -> do T.putStrLn err error (T.unpack err) Right iobs -> do print (take 1 iobs) let chunkSents = map toChunkTree iobs train chunker chunkSents