{-# language DeriveGeneric #-} {-# language OverloadedStrings #-} {-# OPTIONS_GHC -Wno-unused-top-binds #-} {- | Module : GDELT.V2.GKG Description : Parsers for GDELT Global Knowledge Graph (GKG) Version 2.1 Copyright : (c) Marco Zocca 2020 License : MIT Maintainer : ocramz Stability : experimental Portability : portable Data : http://data.gdeltproject.org/gkg/index.html (2013 until present) Codebook : http://data.gdeltproject.org/documentation/GDELT-Global_Knowledge_Graph_Codebook-V2.1.pdf This codebook introduces the GDELT Global Knowledge Graph (GKG) Version 2.1, which expands GDELT’s ability to quantify global human society beyond cataloging physical occurrences towards actually representing all of the latent dimensions, geography, and network structure of the global news. It applies an array of highly sophisticated natural language processing algorithms to each document to compute a range of codified metadata encoding key latent and contextual dimensions of the document. To sum up the GKG in a single sentence, it connects every person, organization, location, count, theme, news source, and event across the planet into a single massive network that captures what’s happening around the world, what its context is and who’s involved, and how the world is feeling about it, every single day. -} module GDELT.V2.GKG ( -- * RecordId RecordId(..), recordId -- * V2.1Date , v21date -- * SourceCollectionIdentifier , SourceCollectionIdentifier(..), sourceCollectionIdentifier -- * Counts , CountTy(..), countTy -- ** V1Counts , CountsV1(..) -- ** V2.1Counts , CountsV21(..), countsV21 -- * LocationV1 , LocationV1(..), locationV1, LocationTy(..) , ) where import Control.Applicative (Alternative(..)) import Data.Functor (($>), void) import GHC.Generics (Generic(..)) -- megaparsec import Text.Megaparsec (count, option, optional) import Text.Megaparsec.Char (string, char, letterChar, spaceChar) import Text.Megaparsec.Char.Lexer (decimal) -- text import Data.Text (Text, pack) -- time import Data.Time.LocalTime (LocalTime(..)) -- import Data.Time.Calendar (Day, fromGregorian) import GDELT.V2.Parsec.Common (Parser, localTime, digit, hash, ADM1(..), adm1, Latitude(..), latitude, Longitude(..), longitude) -- import GDELT.V2.GKG.Themes (ThemesV1(..), themesV1) -- | GKGRECORDID -- -- (string) Each GKG record is assigned a globally unique identifier.Unlike the EVENT system, which uses semi-sequential numbering to assign numeric IDs to each event record, the GKG system uses a date-oriented serial number. Each GKG record ID takes the form “YYYYMMDDHHMMSS-X” or “YYYYMMDDHHMMSS-TX” in which the first portion of the ID is the full date+time of the 15 minute update batch that this record was created in, followed by a dash, followed by sequential numbering for all GKG records created as part of that update batch. Records originating from a document that was translated by GDELT Translingual will have a capital “T” appearing immediately after the dash to allow filtering of English/non-English material simply by its record identifier. Thus, the fifth GKG record created as part of the update batch generated at 3:30AM on February 3, 2015 would have a GKGRECORDID of “20150203033000-5”and if it was based on a French-language document that was translated, it would have the ID "20150203033000-T5". This ID can be used to uniquely identify this particular record across the entire GKG database. data RecordId = RecordId { riDate :: LocalTime , riTranslingual :: Bool , riSeqNo :: Int } deriving (Eq, Show, Generic) -- | Parser for a 'RecordId' recordId :: Parser RecordId recordId = do t <- localTime void $ char '-' tl <- translingual i <- decimal pure $ RecordId t tl i translingual :: Parser Bool translingual = option False (char 'T' $> True) -- | V2.1DATE -- -- (integer) This is the date in YYYYMMDDHHMMSSformat on which the news media used to constructthis GKG file was published. NOTE that unlike the main GDELT event stream files, this date representsthe date of publication of the documentfrom which the information was extracted –if the article discusses events in the past, the date is NOT time-shifted as it is for the GDELT event stream. This date will be the same for all rows in a file and is redundant from a data processing standpoint, but is provided to make it easier to load GKG files directly into an SQL database for analysis. NOTE: for somespecial collections this value may be 0 indicating that the field is either not applicable or not known for those materials. For example, OCR’d historical document collections may not have robust metadata on publication date.NOTE: the GKG 2.0 format still encoded this date in YYYYMMDD format, while under GKG 2.1 it is now in YYYYMMDDHHMMSS format. v21date :: Parser (Maybe LocalTime) v21date = optional localTime -- | V2SOURCECOLLECTIONIDENTIFIER -- -- (integer) This is a numeric identifier that refers to the source collection the document came from and is used to interpret the DocumentIdentifier in the next column. In essence, it specifies how to interpret the DocumentIdentifier to locate the actual document. At present, it can hold one of the following values: -- -- 1 = WEB (The document originates from the open web and the DocumentIdentifier is a fully-qualified URL that can be used to access the document on the web). -- -- 2 = CITATIONONLY (The document originates from a broadcast, print, or other offline source in which only a textual citation is available for the document. In this case the DocumentIdentifier contains the textual citation for the document). -- -- 3 = CORE (The document originates from the CORE archive and the DocumentIdentifier contains its DOI, suitable for accessing the original document through the CORE website). -- -- 4 = DTIC (The document originates from the DTIC archive and the DocumentIdentifier contains its DOI, suitable for accessing the original document through the DTIC website). -- -- 5= JSTOR (The document originates from the JSTOR archive and the DocumentIdentifier contains its DOI, suitable for accessing the original document through your JSTOR subscriptionif your institution subscribes to it). -- -- 6 = NONTEXTUALSOURCE (The document originates from a textual proxy (such as closed captioning) of a non-textual information source (such as a video) available via a URL and the DocumentIdentifier provides the URL of the non-textual original source. At present, this Collection Identifier is used for processing of the closed captioning streams of the Internet Archive Television News Archive in which each broadcast is available via a URL, but the URL offers access only tothe video of the broadcast and does not provide any access to the textual closed captioning used to generate the metadata.This code is used in order to draw a distinction between URL-based textual material (Collection Identifier 1 (WEB) and URL-based non-textual material like the Television News Archive). data SourceCollectionIdentifier = SCIWeb | SCICitationOnly | SCICore | SCIDTIC | SCIJSTOR | SCINonTextualSource deriving (Eq, Show, Enum, Generic) -- | Parser for a 'SourceCollectionIdentifier' sourceCollectionIdentifier :: Parser SourceCollectionIdentifier sourceCollectionIdentifier = toEnum . subtract 1 <$> digit {-| V2SOURCECOMMONNAME (text) This is a human-friendly identifier of the source of the document. For material originating from the open web with a URL this field will contain the top-level domain the page was from. For BBC Monitoring material it will contain “BBC Monitoring” and for JSTOR material it will contain “JSTOR.” This field is intended for human display of major sources as well as for network analysis of information flows by source, obviating the requirement to perform domain or other parsing of the DocumentIdentifier field -} newtype SourceCommonName = SourceCommonName Text deriving (Eq, Show, Generic) {-| V2DOCUMENTIDENTIFIER (text) This is the unique external identifier for the source document.It can be used to uniquely identify the document and access it if you have the necessary subscriptions or authorizations and/or the document is public access. This field can contain a range of values, from URLs ofopen web resourcesto textual citationsof print or broadcast materialto DOI identifiers for various document repositories.For example, if SOURCECOLLECTION is equal to 1, this field will contain a fully-qualified URL suitable for direct access. If SOURCECOLLECTION is equal to 2, this field will contain a textual citation akin to what would appear in an academic journal article referencing that document (NOTE that the actual citation format will vary (usually between APA, Chicago, Harvard, or MLA) depending on a number of factors and no assumptions should be made on its precise formatat this time due to the way in which this data is currently provided to GDELT –future efforts will focus on normalization of this field to a standard citation format). IfSOURCECOLLECTION is 3, the field will contain a numeric or alpha-numeric DOI that can be typed into JSTOR’s search engine to access the document if your institution has a JSTOR subscription. -} newtype DocumentIdentifierV2 = DocumentIdentifierV2 Text deriving (Eq, Show, Generic) -- | V1COUNTS -- -- (semicolon-delimited blocks, with pound symbol (“#”) delimited fields) This is the list of Counts found in this document. Each Count found is separated with a semicolon, while the fields within a Count are separated by the pound symbol (“#”).Unlike the primary GDELT event stream, these records are not issued unique identifier numbers, nor are they dated. -- -- As an example of how to interpret this file, an entry with CountType=KILL, Number=47, ObjectType=”jihadists” indicates that the article stated that 47 jihadists were killed. This field is identical in format and population as the correspondingfield in the GKG 1.0 format. -- -- - Count Type.(text) This is the value of the NAME field from the Category List spreadsheet indicating which category this count is of. At the time of this writing, this is most often AFFECT, ARREST, KIDNAP, KILL, PROTEST, SEIZE, or WOUND, though other categories may appear here as well in certain circumstances when they appear in context with one of these categories, or as other Count categories are added over time. A value of “PROTEST” in this field would indicatethat this is a count of the number of protesters at a protest. -- -- - Count.(integer) This is the actual count being reported. If CountType is “PROTEST” and Number is 126, this means that the source article contained a mention of 126 protesters. -- -- - Object Type.(text) This records any identifying information as to what the number refers to. For example, a mention of “20 Christian missionaries were arrested” will result in “Christian missionaries” being captured here. This field will be blank in cases where no identifying information could be identified. -- -- - LocationType. See the documentation for V1Locations below. -- - Location FullName.See the documentation for V1Locations below. -- - Location CountryCode.See the documentation for V1Locations below. -- - Location ADM1Code.See the documentation for V1Locations below. -- - Location Latitude. See the documentation for V1Locations below. -- - Location Longitude. See the documentation for V1Locations below. -- - Location FeatureID. See the documentation for V1Locations below. data CountsV1 = CountsV1 { cCountTy :: CountTy , cCount :: Int , cObjectType :: Maybe Text , cLocationV1 :: LocationV1 } deriving (Eq, Show, Generic) data CountTy = CTAffect | CTArrest | CTKidnap | CTKill | CTProtest | CTSeize | CTWound | CTOther Text deriving (Eq, Show, Generic) countTy :: Parser CountTy countTy = (string "AFFECT" $> CTAffect) <|> (string "ARREST" $> CTArrest) <|> (string "KIDNAP" $> CTKidnap) <|> (string "KILL" $> CTKill) <|> (string "PROTEST" $> CTProtest) <|> (string "SEIZE" $> CTSeize) <|> (string "WOUND" $> CTWound) -- (string "AFFECT" $> CTAffect) -- | Parser for a 'CountsV1' countsV1 :: Parser CountsV1 countsV1 = do cty <- countTy hash n <- decimal hash oty <- optional (pack <$> many letterChar) hash loc <- locationV1 pure $ CountsV1 cty n oty loc -- | V2.1COUNTS -- -- (semicolon-delimited blocks, with pound symbol (“#”) delimited fields) This field is identical to the V1COUNTS field except that it adds a final additional field to the end of each entry that records its approximate character offset in the document, allowing it to be associated with other entries from other “V2ENHANCED” fields (or Events) that appear in closest proximity to it. Note:unlike the other location-related fields, the Counts field does NOT add ADM2 support at this time. This is to maintain compatibility with assumptions that many applications make about the contents of the Count field. Those applications needing ADM2 support for Counts should cross-reference the FeatureID field of a given Count against the V2Locations field to determine its ADM2 value. data CountsV21 = CountsV21 { c21countsV1 :: CountsV1 , c21charOffset :: Int } deriving (Eq, Show, Generic) -- | Parser for a 'CountsV21' countsV21 :: Parser CountsV21 countsV21 = do c1 <- countsV1 hash i <- decimal pure $ CountsV21 c1 i {-| V1THEMES.(semi-colon-delimited) This is the list of all Themes found in the document. For the complete list of possible themes, see the Category List spreadsheet. At the time of this writing there are over 275 themes currently recognized by the system. This field is identical in format and population as the corresponding field in the GKG1.0 format. -} -- TODO data ThemesV1 -- themesV1List :: Parser [ThemesV1] -- themesV1List = sepBy themesV1 semicolon {-| V2ENHANCEDTHEMES.(semicolon-delimited blocks, with comma-delimited fields) This contains a list of all GKG themes referenced in the document, along with the character offsets of approximately where in the document they were found. For the complete list of possible themes, see the Category List spreadsheet. At the time of this writing there are over 300 themes currently recognized by the system. Each theme reference is separated by a semicolon, and within each reference, the name of the theme is specified first, followed by a comma, and then the approximate character offset of the reference of that theme in the document, allowing it to be associated with other entries from other “V2ENHANCED” fields that appear in closest proximity to it. If a theme is mentioned multiple times in a document, each mention will appear separately in this field. -} -- TODO data ThemesV2 -- themesV2enhanced :: Parser [(ThemesV1, Int)] -- themesV2enhanced = sepBy p semicolon -- where -- p = (,) <$> themesV1 <*> decimal {-| V1LOCATIONS (semicolon-delimited blocks, with pound symbol (“#”) delimited fields) This is a list of all locations found in the text, extracted through the Leetaru (2012) algorithm. 2The algorithm is run in a more aggressive stance here than ordinary in order to extract every possible locative referent, so may have a slightly elevated level of false positives. NOTE:some locations have multiple accepted formal or informal names and this field is collapsed on name, rather than feature (since in some applications the understanding of a geographic feature differs based on which name was used to reference it). In cases where it is necessary to collapse by feature, the Geo_FeatureID column should be used, rather than the Geo_Fullname column. This is because the Geo_Fullname column captures the name of the location as expressed in the text and thus reflects differences in transliteration, alternative spellings, and alternative names for the same location. For example, Mecca is often spelled Makkah, while Jeddah is commonly spelled Jiddah or Jaddah. The Geo_Fullname column will reflect each of these different spellings, while the Geo_FeatureID column will resolve them all to the same unique GNS or GNIS feature identification number. For more information on the GNS andGNIS identifiers, see Leetaru (2012). 3This field is identical in format and population as the corresponding field in the GKG 1.0 format. NOTE:there was an error in this field from 2/19/2015 through midday 3/1/2015 that caused the CountryCode field to listthe wrong country code in somecases. - Location Type. (integer) This field specifies the geographic resolution of the match type and holds one of the following values: 1=COUNTRY (match was at the country level), 2=USSTATE (match was to a US state), 3=USCITY (match was to a US city or landmark), 4=WORLDCITY (match was to a city or landmark outside the US), 5=WORLDSTATE (match was to an Administrative Division 1 outside the US –roughly equivalent to a US state). This can be used to filter counts by geographic specificity, for example, extracting only those counts with a landmark-level geographic resolution for mapping. Note that matches with codes 1 (COUNTRY), 2 (USSTATE), and 5 (WORLDSTATE) will still provide a latitude-longitude pair, which will be the centroid of that country or state, but the FeatureID field below will contain its textual country or ADM1 code instead of a numeric featureid. - Location FullName.(text) This is the full human-readable name of the matched location. In the case of a country it is simply the country name. For US and World states it is in the format of “State, Country Name”, while for all other matches it is in the format of “City/Landmark, State, Country”. This can be used to label locations when placing counts on a map. Note: this field reflects the precise name used to refer to the location in the text itself, meaning it may contain multiple spellings of the same location –use the FeatureID column to determine whether two location names refer to the same place. - Location CountryCode. (text) This is the 2-character FIPS10-4 country code for the location. Note:GDELT continues to use the FIPS10-4 codes under USG guidance while GNS continues its formal transitionto the successor Geopolitical Entities, Names, and Codes (GENC) Standard (the US Government profile of ISO 3166). - Location ADM1Code. (text) This is the 2-character FIPS10-4 country code followed by the 2-character FIPS10-4 administrative division 1 (ADM1) code for the administrative division housing the landmark. In the case of the United States, this is the 2-character shortform of the state’s name (such as “TX” for Texas). Note : see the notice above for CountryCode regarding the FIPS10-4 - GENC transition. Note: to obtain ADM2 (district-level) assignments for locations, you can either perform a spatial join against a ShapeFile template in any GIS software, or cross-walk the FeatureID to the GNIS-GNS databases –this will provide additional fields such as ADM2 codes and MGRS grid references for GNS. - Location Latitude. (floating point number) This is the centroid latitude of the landmark for mapping.In the case of a country or administrative division this will reflect the centroid of that entire country resp. division. - Location Longitude. (floating point number) This is the centroid longitude of the landmark for mapping.In the case of a country or administrative division this will reflect the centroid of that entire country resp. division. - Location FeatureID. (text OR signed integer) This is the numeric GNS or GNIS FeatureID for this location OR a textual country or ADM1 code. More information on these values can be found in Leetaru (2012). Note : This field will be blank or contain a textual ADM1 code for country or ADM1-level matches –see above. Note: For numeric GNS or GNIS FeatureIDs, this field can contain both positive and negative numbers, see Leetaru (2012) for more information on this. -} data LocationV1 = LocationV1 { locTy :: LocationTy , locFullName :: Text , locCountryCode :: Text , locADM1Code :: ADM1 , locLat :: Latitude , locLon :: Longitude , locFeatureId :: Either Text Int } deriving (Eq, Show, Generic) -- | Parser for a 'LocationV1' locationV1 :: Parser LocationV1 locationV1 = do lty <- locationTy hash lfn <- locationFullName hash lcc <- locationCountryCode hash a <- adm1 hash lat <- latitude hash lon <- longitude hash fid <- featureId pure $ LocationV1 lty lfn lcc a lat lon fid data LocationTy = LTCountry | LTUSState | LTUSCity | LTWorldCity | LTWorldState deriving (Eq, Show, Enum, Generic) locationTy :: Parser LocationTy locationTy = toEnum . subtract 1 <$> decimal locationFullName :: Parser Text locationFullName = pack <$> many (letterChar <|> char ',' <|> spaceChar) locationCountryCode :: Parser Text locationCountryCode = pack <$> count 2 letterChar featureId :: Parser (Either Text Int) featureId = (Left . pack <$> count 2 letterChar) <|> (Right <$> decimal) {-| V2ENHANCEDLOCATIONS. (semicolon-delimited blocks, with pound symbol (“#”) delimited fields) This field is identical to the V1LOCATIONS field with the primary exception of an extra field appended to the end of each location block after its FeatureID that lists the approximate character offset of the reference to that location in the text. In addition, if a location appears multiple times in the article, it will be listed multiple times in this field. The only other modification from V1LOCATIONS is the addition of a single new field “Location ADM2Code” in between “Location ADM1Code” and “Location Latitude”. - NOTE : there was an error in this field from 2/19/2015 through midday 3/1/2015 that caused the CountryCode field to list the wrong country code in some cases. -}