Skip to content

Instantly share code, notes, and snippets.

@ahalterman
Last active December 18, 2015 15:48
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahalterman/5806469 to your computer and use it in GitHub Desktop.
Save ahalterman/5806469 to your computer and use it in GitHub Desktop.
Subsetting GDELT for domestic events using R. I'm looking at domestic activities coded by GDELT, including protests. This is my walkthrough of how I subset only events occurring inside Georgia between 1979 and 2012 in the GDELT reduced dataset. 1. If you just use the Python script to subset the full (reduced) dataset, you end up with only events …
# Example for subsetting domestic events in Georgia from the GDELT reduced dataset.
# Read in the python output file.
# NOTE(review): the original comment here said that header = TRUE "didn't
# work", which suggests the file has no header line -- confirm against the
# file. On that assumption it is read with header = FALSE, so the first event
# is kept as data instead of being consumed as column names, and the column
# names are supplied directly.
# NOTE(review): read.table() guesses column types, so an all-digit EventCode
# column would be read as numeric and lose any leading zeros; pass colClasses
# if the codes are needed as text.
GEO.ALL <- read.table(
  "./R/GDELT/GEO.ALL.select.outfile.txt",
  sep = "\t",
  header = FALSE,
  col.names = c(
    "Day", "Actor1Code", "Actor2Code", "EventCode", "QuadCategory",
    "GoldsteinScale", "Actor1Geo_Lat", "Actor1Geo_Long", "Actor2Geo_Lat",
    "Actor2Geo_Long", "ActionGeo_Lat", "ActionGeo_Long"
  )
)
# Prepare GEO.ALL for the subsetting step: store the first three characters of
# each actor code in a new column (Actor1short / Actor2short), so the later
# comparison only has to look at that three-letter prefix.
GEO.ALL[["Actor1short"]] <- substr(GEO.ALL[["Actor1Code"]], start = 1, stop = 3)
GEO.ALL[["Actor2short"]] <- substr(GEO.ALL[["Actor2Code"]], start = 1, stop = 3)
# Negated membership operator (thanks Paul Teetor): for each element of x,
# TRUE when it has no match in y and FALSE when it does.
`%notin%` <- function(x, y) {
  # match() reports 0 for elements without a match, so "== 0" is the exact
  # complement of %in%.
  match(x, y, nomatch = 0L) == 0L
}
# Finally, the big subset. Keep only the rows in which neither shortened actor
# code matches one of the country codes listed below. Per the original author
# this is the list of all CAMEO country codes minus Georgia; "GEO" is absent,
# so rows with Georgian actors are retained.
# The list is defined once and reused for both actors so the two filters
# cannot drift apart.
foreign_codes <- c(
  "AFG", "ALA", "ALB", "DZA", "ASM", "AND", "AGO", "AIA", "ATG",
  "ARG", "ARM", "ABW", "AUS", "AUT", "AZE", "BHS", "BHR", "BGD",
  "BRB", "BLR", "BEL", "BLZ", "BEN", "BMU", "BTN", "BOL", "BIH",
  "BWA", "BRA", "VGB", "BRN", "BGR", "BFA", "BDI", "KHM", "CMR",
  "CAN", "CPV", "CYM", "CAF", "TCD", "CHL", "CHN", "COL", "COM",
  "COD", "COG", "COK", "CRI", "CIV", "HRV", "CUB", "CYP", "CZE",
  "DNK", "DJI", "DMA", "DOM", "TMP", "ECU", "EGY", "SLV", "GNQ",
  "ERI", "EST", "ETH", "FRO", "FLK", "FJI", "FIN", "FRA", "GUF",
  "PYF", "GAB", "GMB", "DEU", "GHA", "GIB", "GRC", "GRL",
  "GRD", "GLP", "GUM", "GTM", "GIN", "GNB", "GUY", "HTI", "VAT",
  "HND", "HKG", "HUN", "ISL", "IND", "IDN", "IRN", "IRQ", "IRL",
  "IMY", "ISR", "ITA", "JAM", "JPN", "JOR", "KAZ", "KEN", "KIR",
  "PRK", "KOR", "KWT", "KGZ", "LAO", "LVA", "LBN", "LSO", "LBR",
  "LBY", "LIE", "LTU", "LUX", "MAC", "MKD", "MDG", "MWI", "MYS",
  "MDV", "MLI", "MLT", "MHL", "MTQ", "MRT", "MUS", "MYT", "MEX",
  "FSM", "MDA", "MCO", "MNG", "MTN", "MSR", "MAR", "MOZ", "MMR",
  "NAM", "NRU", "NPL", "NLD", "ANT", "NCL", "NZL", "NIC", "NER",
  "NGA", "NIU", "NFK", "MNP", "NOR", "PSE", "OMN", "PAK", "PLW",
  "PAN", "PNG", "PRY", "PER", "PHL", "PCN", "POL", "PRT", "PRI",
  "QAT", "REU", "ROM", "RUS", "RWA", "SHN", "KNA", "LCA", "SPM",
  "VCT", "WSM", "SMR", "STP", "SAU", "SEN", "SRB", "SYC", "SLE",
  "SGP", "SVK", "SVN", "SLB", "SOM", "ZAF", "ESP", "LKA", "SDN",
  "SUR", "SJM", "SWZ", "SWE", "CHE", "SYR", "TJK", "TZA", "THA",
  "TGO", "TKL", "TON", "TTO", "TUN", "TUR", "TKM", "TCA", "TUV",
  "UGA", "UKR", "ARE", "GBR", "USA", "VIR", "URY", "UZB", "VUT",
  "VEN", "VNM", "WLF", "ESH", "YEM", "ZMB", "ZWE"
)
# A row counts as domestic when neither actor prefix is a foreign code. The
# mask is never NA: a missing or blank actor prefix simply has no match in the
# list, so such rows are kept.
is_domestic <- GEO.ALL$Actor1short %notin% foreign_codes &
  GEO.ALL$Actor2short %notin% foreign_codes
# Plain logical indexing selects the same rows as subset() here, without
# relying on subset()'s non-standard evaluation.
georgia <- GEO.ALL[is_domestic, , drop = FALSE]
# The original walkthrough reports 15,496 rows here (the first value of
# dim(georgia)), all domestic events. There you go.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment