import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX  # needed for the forecasting loop below

## This assumes a Jupyter notebook; the ! gives the notebook command-line access.
## If not using a notebook, run "pip install gdelt" on the command line for your OS instead.
!pip install gdelt

import gdelt
gd = gdelt.gdelt(version=1)  # version 1 of the GDELT API exposes the GKG table queried below
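# A quick way to inspect one day of GKG data before bulk-downloading. This helper
# is ours, not part of the original workflow; it only reuses the same gd.Search
# call and the three fields (DATE, SOURCES, LOCATIONS) relied on below.
def preview_day(date_str="2019 10 06"):
    sample = gd.Search([date_str], table='gkg', coverage=True, translation=False)
    return sample[["DATE", "SOURCES", "LOCATIONS"]].head()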
# Create a data folder if it doesn't already exist
import os
os.makedirs("data", exist_ok=True)
# Download each day's articles from 8/8/19 through 10/6/19, skipping days already in the data folder
import datetime
end_date = datetime.datetime(2019, 10, 7)
cur_date = end_date - datetime.timedelta(days=60)
while cur_date < end_date:
    print("%s-%s-%s" % (cur_date.year, cur_date.month, cur_date.day))
    if not os.path.exists("data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)):
        # GDELT expects zero-padded "YYYY MM DD" date strings
        results = gd.Search([cur_date.strftime("%Y %m %d")], table='gkg', coverage=True, translation=False)
        results.to_pickle("data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day))
    cur_date += datetime.timedelta(days=1)
# Load each day's articles back from the data folder, keeping only the publishers
# of interest. This is brute force but general; for a longer period, put everything
# into a database and load from there (e.g. SQL; a sketch follows this loop).
mySources = ["nytimes.com", "washingtonpost.com", "foxnews.com", "cnn.com"]  # must be defined before it is used to filter
df = pd.DataFrame()
for i in os.listdir("data"):
    print(i)
    if i.endswith(".pkl"):
        tmp = pd.read_pickle("data/" + i)
        tmp = tmp[tmp["SOURCES"].apply(lambda x: x in mySources)]
        df = pd.concat([df, tmp])
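# A minimal sketch of the SQL route mentioned above, assuming SQLite via pandas'
# to_sql/read_sql; the table name "gkg" and this helper are ours, not part of the
# original workflow.
def load_via_sqlite(data_dir="data", db_path="data/gkg.db"):
    import sqlite3
    con = sqlite3.connect(db_path)
    for f in os.listdir(data_dir):
        if f.endswith(".pkl"):
            # append each pickled day into one table, then read everything back at once
            pd.read_pickle(os.path.join(data_dir, f)).to_sql("gkg", con, if_exists="append", index=False)
    return pd.read_sql("SELECT * FROM gkg", con)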
# Format the dataframe: parse the DATE field and use it as the index
df["DATE"] = pd.to_datetime(df["DATE"].astype(str))
df.fillna("", inplace=True)
df.set_index("DATE", drop=True, inplace=True)
# Flag articles whose LOCATIONS field mentions each country of interest
df["dprk"] = df["LOCATIONS"].apply(lambda x: x.find("North Korea") > -1)
df["ukraine"] = df["LOCATIONS"].apply(lambda x: x.find("Ukraine") > -1)
df["russia"] = df["LOCATIONS"].apply(lambda x: x.find("Russia") > -1)
df["iran"] = df["LOCATIONS"].apply(lambda x: x.find("Iran") > -1)
df["china"] = df["LOCATIONS"].apply(lambda x: x.find("China") > -1)
# Count per-publisher, per-day mentions of each country
loc_df = df.groupby(["SOURCES", "DATE"])[["dprk", "ukraine", "russia", "iran", "china"]].sum()
# Create a time_series dataframe indexed by date, with one column per
# publisher + topic pair (e.g. "nytimes.com_russia")
time_series = pd.DataFrame()
for publisher in mySources:
    time_series = pd.concat([time_series, loc_df.loc[publisher].add_prefix("{}_".format(publisher))], axis=1)
# Visualize the time series to make sure they loaded properly
time_series.plot(figsize=(20,10), title="Articles Per Day Mentioning Various Countries by Publisher")
# DataFrame to keep track of model scores
mape_df = pd.DataFrame()
# Fit a SARIMAX model to the historical data for each publisher/topic series,
# scoring with mean absolute percentage error (MAPE) rather than root mean
# squared error, which makes performance easier to compare across series.
# See https://towardsdatascience.com/modeling-news-coverage-with-python-part-2-starting-news-forecasting-with-limited-data-5c1092de3ea9
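# For reference, the MAPE computed inline in the loop below is
# mean(|actual - predicted| / actual) * 100 over days with a nonzero actual
# count; this standalone helper is equivalent, but the name is ours.
def mape(actual, predicted):
    error = np.abs((actual - predicted) / actual)
    return round(error[actual != 0].mean() * 100)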
fig, axs = plt.subplots(4, 5, sharex=True)
fig.set_size_inches(16, 12)
x = y = 0
for issue in time_series:
    if issue.find(".com") < 0:
        continue
    train_l = len(time_series) - 5  # hold out the last 5 days as a test set
    # Exogenous inputs: the same topic at the other publishers...
    selected_series = time_series[[col for col in time_series.columns if col.find(issue[issue.find("_"):]) > -1]]
    # ...plus the other topics at the same publisher
    pub_series = time_series[[col for col in time_series.columns if col.find(issue[:issue.find("_")]) > -1]].drop(columns=issue)
    selected_series = selected_series.join(pub_series)
    # Fit on everything but the last 5 days; exogenous series are lagged one day
    # (the first row is dropped because the shift leaves it empty)
    s_model = SARIMAX(endog=selected_series[[issue]][:train_l][1:],
                      exog=selected_series[[c for c in selected_series.columns if c != issue]][:train_l].shift().add_suffix("_l1")[1:],
                      order=(3, 1, 1), seasonal_order=(1, 0, 1, 7)).fit()
    f_ru = selected_series[[issue]].copy()[1:]
    f_ru.columns = ["actual"]
    # Predict through the end of the sample; the last 5 rows of lagged exogenous
    # values cover the out-of-sample test window
    f_ru["predicted"] = s_model.predict(end=datetime.datetime(2019, 10, 6),
                                        exog=selected_series[[c for c in selected_series.columns if c != issue]].shift().add_suffix("_l1")[-5:],
                                        dynamic=False)
    testing = f_ru.copy()
    testing["error"] = np.abs((testing["actual"] - testing["predicted"]) / testing["actual"])
    # MAPE over the whole series (model fit) and over the 5 held-out days (prediction)
    fit = round(testing[testing["actual"] != 0].error.mean() * 100)
    mape_df.loc[issue, "NewspapersOnly_model"] = fit
    testing2 = testing[-5:]
    fit_p = round(testing2[testing2["actual"] != 0].error.mean() * 100)
    mape_df.loc[issue, "NewspapersOnly_predicted"] = fit_p
    f_ru["actual"].plot(title="{}\nMAPE: test: {}% model: {}%".format(issue, fit_p, fit), ax=axs[x, y])
    f_ru["predicted"][:-5].plot(color="orange", label="predicted: Train", ax=axs[x, y])
    f_ru["predicted"][-6:].plot(color="red", label="predicted: Test", ax=axs[x, y])
    x += 1
    if x > 3:
        x = 0
        y += 1
handles, labels = axs[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='center right')
fig.suptitle("News forecast using within publication and external topic")
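# mape_df now holds one row per publisher/topic series with the in-sample and
# test MAPE; printing it gives a quick scoreboard, and further model variants
# can be added as extra columns for side-by-side comparison.
print(mape_df)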