{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE ScopedTypeVariables #-} module Text.CoreNLP.Types ( Dependency(..) , Entitymention(..) , Token(..) , Sentence(..) , PennPOS(..) , Coref(..) , CorefsId , Corefs , Document(..) , NamedEntity(..) ) where import Control.Applicative import qualified Data.Aeson as J import Data.Aeson (FromJSON, ToJSON, (.:), (.:?)) import Data.Map (Map) import Data.Maybe (fromMaybe) import Data.Text (Text) import GHC.Generics (Generic) -- | Contains generic "hacks" we put on top to go around limitations like a -- "type" keyword we can't use jsonOpts :: J.Options jsonOpts = J.defaultOptions {J.fieldLabelModifier = fm} where fm :: String -> String fm "type_" = "type" fm x = x data Dependency = Dependency { dep :: Text , governor :: Int , governorGloss :: Text , dependent :: Int , dependentGloss :: Text } deriving (Show, Eq, Generic) instance FromJSON Dependency where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Dependency where toJSON = J.genericToJSON jsonOpts data Entitymention = Entitymention { docTokenBegin :: Int , docTokenEnd :: Int , tokenBegin :: Int , tokenEnd :: Int , text :: Text , characterOffsetBegin :: Int , characterOffsetEnd :: Int , ner :: Text , normalizedNER :: Maybe Text } deriving (Show, Eq, Generic) instance FromJSON Entitymention where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Entitymention where toJSON = J.genericToJSON jsonOpts data PennPOS = CC -- ^ Coordinating conjunction | CD -- ^ Cardinal number | DT -- ^ Determiner | EX -- ^ Existential *there* | FW -- ^ Foreign word | IN -- ^ Preposition or subordinating conjunction | JJ -- ^ Adjective | JJR -- ^ Adjective, comparative | JJS -- ^ Adjective, superlative | LS -- ^ List item marker | MD -- ^ Modal | NN -- ^ Noun, singular or mass | NNS -- ^ Noun, plural | NNP -- ^ Proper noun, singular | NNPS -- ^ Proper noun, plural | PDT -- ^ Predeterminer | POS -- ^ Possessive ending | PRP -- ^ Personal pronoun | PRPDollar -- ^ Possessive pronoun | RB -- ^ Adverb | RBR -- ^ Adverb, comparative | RBS -- ^ Adverb, superlative | RP -- ^ Particle | SYM -- ^ Symbol | TO -- ^ *to* | UH -- ^ Interjection | VB -- ^ Verb, base form | VBD -- ^ Verb, past tense | VBG -- ^ Verb, gerund or present participle | VBN -- ^ Verb, past participle | VBP -- ^ Verb, non-3rd person singular present | VBZ -- ^ Verb, 3rd person singular present | WDT -- ^ Wh-determiner | WP -- ^ Wh-pronoun | WPDollar -- ^ Possessive wh-pronoun | WRB -- ^ Wh-adverb | LRB -- ^ "-LRB-"? No idea what's this | RRB -- ^ "-RRB-"? No idea what's this | PosPunctuation -- ^ anyOf ".:,''$#$,", sometimes few together deriving (Show, Eq, Generic) instance FromJSON PennPOS where parseJSON (J.String "WP$") = pure WPDollar parseJSON (J.String "PRP$") = pure PRPDollar parseJSON (J.String "-LRB-") = pure LRB parseJSON (J.String "-RRB-") = pure RRB parseJSON x = J.genericParseJSON jsonOpts x <|> parsePunctuation x where parsePunctuation (J.String _) = pure PosPunctuation parsePunctuation _ = fail "Expecting POS to be a String" instance ToJSON PennPOS where toJSON WPDollar = J.String "WP$" toJSON PRPDollar = J.String "PRP$" toJSON PosPunctuation = J.String "." toJSON LRB = J.String "-LRB-" toJSON RRB = J.String "-RRB-" toJSON x = J.genericToJSON jsonOpts x -- | See https:\/\/stanfordnlp.github.io\/CoreNLP\/ner.html data NamedEntity = PERSON | LOCATION | ORGANIZATION | MISC | MONEY | NUMBER | ORDINAL | PERCENT | DATE | TIME | DURATION | SET | EMAIL | URL | CITY | STATE_OR_PROVINCE | COUNTRY | NATIONALITY | RELIGION | TITLE -- ^ Job title | IDEOLOGY | CRIMINAL_CHARGE | CAUSE_OF_DEATH | O -- ^ Not a named entity? TODO: check somehow deriving (Show, Eq, Generic) instance FromJSON NamedEntity where parseJSON = J.genericParseJSON jsonOpts instance ToJSON NamedEntity where toJSON = J.genericToJSON jsonOpts data Token = Token { index :: Int , word :: Text , originalText :: Text , lemma :: Text , characterOffsetBegin :: Int , characterOffsetEnd :: Int , pos :: PennPOS , ner :: NamedEntity , speaker :: Maybe Text , before :: Text , after :: Text } deriving (Show, Eq, Generic) instance FromJSON Token where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Token where toJSON = J.genericToJSON jsonOpts data Sentence = Sentence { index :: Int , parse :: Maybe Text , basicDependencies :: [Dependency] , enhancedDependencies :: [Dependency] , enhancedPlusPlusDependencies :: [Dependency] , entitymentions :: [Entitymention] , tokens :: [Token] } deriving (Show, Eq, Generic) instance FromJSON Sentence where parseJSON = J.withObject "sentence" $ \s -> do index <- s .: "index" parse <- s .:? "parse" basicDependencies <- fromMaybe [] <$> s .:? "basicDependencies" enhancedDependencies <- fromMaybe [] <$> s .:? "enhancedDependencies" enhancedPlusPlusDependencies <- fromMaybe [] <$> s .:? "enhancedPlusPlusDependencies" entitymentions <- fromMaybe [] <$> s .:? "entitymentions" tokens <- fromMaybe [] <$> s .:? "tokens" pure $ Sentence { .. } instance ToJSON Sentence where toJSON = J.genericToJSON jsonOpts data Coref = Coref { id :: Int , text :: Text , type_ :: Text , number :: Text , gender :: Text , animacy :: Text , startIndex :: Int , endIndex :: Int , headIndex :: Int , sentNum :: Int , position :: [Int] , isRepresentativeMention :: Bool } deriving (Show, Eq, Generic) instance FromJSON Coref where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Coref where toJSON = J.genericToJSON jsonOpts type CorefsId = Text type Corefs = Map CorefsId [Coref] data Document = Document { docId :: Maybe Text , sentences :: [Sentence] , corefs :: Maybe Corefs } deriving (Show, Eq, Generic) instance FromJSON Document where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Document where toJSON = J.genericToJSON jsonOpts