Skip to content

Instantly share code, notes, and snippets.

@brockmanmatt
Created October 13, 2019 23:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brockmanmatt/ee7767757386db2365c8f24118d59ffc to your computer and use it in GitHub Desktop.
Save brockmanmatt/ee7767757386db2365c8f24118d59ffc to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install gdelt
import gdelt
gd = gdelt.gdelt(version=1)
import os
os.makedirs("data",exist_ok=True)
import datetime

# Download window: the 60 days before 2019-10-07 (the end date is excluded).
end_date = datetime.datetime(2019,10,7)
cur_date = end_date - datetime.timedelta(days=60)
one_day = datetime.timedelta(days=1)

while cur_date < end_date:
    # Progress output and cache file names use the un-padded
    # year-month-day form, e.g. "2019-8-8".
    date_parts = (cur_date.year, cur_date.month, cur_date.day)
    print("%s-%s-%s" % date_parts)
    pickle_path = "data/%s-%s-%s.pkl" % date_parts

    # Only query days that are not already cached on disk.
    if not os.path.exists(pickle_path):
        # The search string, by contrast, zero-pads month and day
        # ("2019 08 08").
        query = '%s %s %s' % (
            cur_date.year,
            str(cur_date.month).zfill(2),
            str(cur_date.day).zfill(2),
        )
        results = gd.Search([query], table='gkg', coverage=True, translation=False)
        results.to_pickle(pickle_path)

    cur_date += one_day
# Publishers whose rows are kept. Defined here because the filter below needs
# it; without this a top-to-bottom run fails with NameError on `mySources`.
mySources = ["nytimes.com", "washingtonpost.com", "foxnews.com", "cnn.com"]


def load_source_frames(data_dir, sources):
    """Load the cached daily pickles and keep only rows from *sources*.

    Every entry of *data_dir* is printed; entries ending in ``.pkl`` are read
    with ``pd.read_pickle`` and filtered to the rows whose ``SOURCES`` value
    is exactly one of *sources*.

    Args:
        data_dir: Directory containing the ``.pkl`` files.
        sources: Collection of ``SOURCES`` values to keep.

    Returns:
        A single DataFrame with the filtered rows of every pickle, or an
        empty DataFrame when the directory is missing or holds no pickles.
    """
    if not os.path.isdir(data_dir):
        return pd.DataFrame()

    frames = []
    for name in os.listdir(data_dir):
        print(name)
        if not name.endswith(".pkl"):
            continue
        # NOTE: unpickling can execute arbitrary code; only point this at
        # cache files written by this script.
        day = pd.read_pickle(os.path.join(data_dir, name))
        frames.append(day[day["SOURCES"].isin(sources)])

    # Concatenate once at the end: re-concatenating inside the loop copies
    # the accumulated frame for every file.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


df = load_source_frames("data", mySources)
# Turn DATE into timestamps: each value is stringified first, then parsed.
df["DATE"] = pd.to_datetime(df["DATE"].map(str))

# Replace missing values with "" so the substring searches below always
# operate on strings, then index the frame by DATE.
df = df.fillna("")
df = df.set_index("DATE", drop=True)

# (flag column, text searched for in LOCATIONS)
location_flags = [
    ("dprk", "North Korea"),
    ("ukraine", "Ukraine"),
    ("russia", "Russia"),
    ("iran", "Iran"),
    ("china", "China"),
]
for flag_name, place in location_flags:
    # True when the place name occurs anywhere in the row's LOCATIONS text.
    df[flag_name] = df["LOCATIONS"].apply(
        lambda text, place=place: text.find(place) != -1
    )

# Summing the boolean flags per (SOURCES, DATE) group counts the flagged rows.
flag_columns = [flag_name for flag_name, _ in location_flags]
loc_df = df.groupby(["SOURCES", "DATE"])[flag_columns].sum()
# Publishers to pivot into columns.
mySources = ["nytimes.com", "washingtonpost.com", "foxnews.com", "cnn.com"]

# For each publisher, select its rows from loc_df and prefix the count
# columns with the publisher's domain (e.g. "cnn.com_dprk").
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
publisher_frames = [
    loc_df.loc[publisher].add_prefix("{}_".format(publisher))
    for publisher in mySources
]

# Join the per-publisher frames side by side in a single concat instead of
# growing the result one publisher at a time.
time_series = pd.concat(publisher_frames, axis=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment