import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## This assumes a Jupyter notebook (the ! prefix gives the notebook
## command-line capabilities); if not using a notebook, remove the !
## and run pip install on the command line for your OS
!pip install gdelt

import gdelt
gd = gdelt.gdelt(version=1)
# Create a data folder if it doesn't already exist
import os
os.makedirs("data", exist_ok=True)
# Download articles from 8/8/19 to 10/6/19 if not already in the data folder
import datetime
end_date = datetime.datetime(2019, 10, 7)
cur_date = end_date - datetime.timedelta(days=60)
while cur_date < end_date:
    fname = "data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)
    print(fname)
    if not os.path.exists(fname):
        # gdelt's Search wants a zero-padded "YYYY MM DD" date string
        results = gd.Search([cur_date.strftime("%Y %m %d")],
                            table='gkg', coverage=True, translation=False)
        results.to_pickle(fname)
    cur_date += datetime.timedelta(days=1)
# Load each day's articles from the data folder, keeping only the sources
# of interest. This is brute force but keeps things general; if working
# with a longer period, put everything into a database and query it
# instead (e.g. SQLite; see the sketch after this loop).
mySources = ["nytimes.com", "washingtonpost.com", "foxnews.com", "cnn.com"]

df = pd.DataFrame()
for fname in os.listdir("data"):
    print(fname)
    if fname.endswith(".pkl"):
        tmp = pd.read_pickle("data/" + fname)
        tmp = tmp[tmp["SOURCES"].apply(lambda x: x in mySources)]
        df = pd.concat([df, tmp])
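
# A minimal sketch of the database route mentioned above, assuming SQLite via
# pandas' to_sql/read_sql. The table name "gkg" and db path are illustrative
# choices, not part of the original workflow.
import sqlite3

def store_day(day_df, db_path="data/gkg.db"):
    # Append one day's GKG rows to a "gkg" table
    with sqlite3.connect(db_path) as con:
        day_df.to_sql("gkg", con, if_exists="append", index=False)

def load_sources(sources, db_path="data/gkg.db"):
    # Pull back only rows whose SOURCES value is in the given list
    placeholders = ",".join("?" * len(sources))
    with sqlite3.connect(db_path) as con:
        return pd.read_sql("SELECT * FROM gkg WHERE SOURCES IN (%s)" % placeholders,
                           con, params=sources)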
# Format the dataframe and flag the countries of interest in each article
df.DATE = df.DATE.apply(lambda x: str(x))
df.DATE = pd.to_datetime(df.DATE)
df.fillna("", inplace=True)
df.set_index("DATE", drop=True, inplace=True)
# True where the GKG LOCATIONS field mentions the country
df["dprk"] = df["LOCATIONS"].apply(lambda x: x.find("North Korea") > -1)
df["ukraine"] = df["LOCATIONS"].apply(lambda x: x.find("Ukraine") > -1)
df["russia"] = df["LOCATIONS"].apply(lambda x: x.find("Russia") > -1)
df["iran"] = df["LOCATIONS"].apply(lambda x: x.find("Iran") > -1)
df["china"] = df["LOCATIONS"].apply(lambda x: x.find("China") > -1)
# Daily article counts per publisher and topic ("DATE" is the index level)
loc_df = df.groupby(["SOURCES", "DATE"])[["dprk", "ukraine", "russia", "iran", "china"]].sum()
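# loc_df now has a (SOURCES, DATE) MultiIndex with one count column per topic;
# an illustrative lookup for a single publisher's daily counts:
# loc_df.loc["nytimes.com"].head()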
# Build a time_series dataframe indexed by date, with one column per
# publisher + topic combination (e.g. "nytimes.com_dprk")
time_series = pd.DataFrame()
for publisher in mySources:
    # .loc replaces the long-deprecated .ix for label-based indexing
    time_series = pd.concat([time_series, loc_df.loc[publisher].add_prefix("{}_".format(publisher))], axis=1)
# Visualize the series to make sure they loaded properly
time_series.plot(figsize=(20, 10), title="Articles Per Day Mentioning Various Countries by Publisher")
# Dataframe to keep track of scoring
mape_df = pd.DataFrame()

# Fit a SARIMA model on the older data, scoring with mean absolute percentage
# error (MAPE) rather than root mean squared error, which makes performance
# easier to compare across series; see
# https://towardsdatascience.com/modeling-news-coverage-with-python-part-2-starting-news-forecasting-with-limited-data-5c1092de3ea9
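# For reference, the MAPE computed inline below, pulled out as a standalone
# helper (a hypothetical refactor, not part of the original script); days
# where the actual count is 0 are dropped to avoid dividing by zero
def mape(actual, predicted):
    mask = actual != 0
    return np.abs((actual[mask] - predicted[mask]) / actual[mask]).mean() * 100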
from statsmodels.tsa.statespace.sarimax import SARIMAX

fig, axs = plt.subplots(4, 5, sharex=True)
fig.set_size_inches(16, 12)
x = y = 0  # subplot row/column cursors
for issue in time_series:
    if issue.find(".com") < 0:  # skip anything that isn't a publisher_topic column
        continue
    train_l = len(time_series) - 5  # hold out the last 5 days for testing
    # exogenous candidates: the same topic at the other publishers, plus the
    # other topics at this publisher
    selected_series = time_series[[col for col in time_series.columns if col.find(issue[issue.find("_"):]) > -1]]
    pub_series = time_series[[col for col in time_series.columns if col.find(issue[:issue.find("_")]) > -1]].drop(columns=issue)
    selected_series = selected_series.join(pub_series)
    # fit on everything but the held-out days, with one-day-lagged exogenous
    # regressors; [1:] drops the row the shift leaves empty
    s_model = SARIMAX(endog=selected_series[[issue]][:train_l][1:],
                      exog=selected_series[[c for c in selected_series.columns if c != issue]][:train_l].shift().add_suffix("_l1")[1:],
                      order=(3, 1, 1), seasonal_order=(1, 0, 1, 7)).fit()
    f_ru = selected_series[[issue]].copy()[1:]
    f_ru.columns = ["actual"]
    # in-sample predictions plus a 5-step out-of-sample forecast; exog only
    # needs to cover the 5 forecast days
    f_ru["predicted"] = s_model.predict(end=datetime.datetime(2019, 10, 6),
                                        exog=selected_series[[c for c in selected_series.columns if c != issue]].shift()[-5:],
                                        dynamic=False)
    testing = f_ru.copy()
    testing["error"] = np.abs((testing["actual"] - testing["predicted"]) / testing["actual"])
    # MAPE over the whole series (model fit) and over just the held-out days
    # (prediction), skipping days where the actual count is 0
    fit = round(testing[testing["actual"] != 0].error.mean() * 100)
    mape_df.loc[issue, "NewspapersOnly_model"] = fit
    testing2 = testing[-5:]
    fit_p = round(testing2[testing2["actual"] != 0].error.mean() * 100)
    mape_df.loc[issue, "NewspapersOnly_predicted"] = fit_p
    f_ru["actual"].plot(title="{}\nMAPE: test: {}% model: {}%".format(issue, fit_p, fit), ax=axs[x, y])
    f_ru["predicted"][:-5].plot(color="orange", label="predicted: Train", ax=axs[x, y])
    f_ru["predicted"][-6:].plot(color="red", label="predicted: Test", ax=axs[x, y])
    x += 1
    if x > 3:
        x = 0
        y += 1
handles, labels = axs[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='center right')
fig.suptitle("News forecast using within-publication and external topics")
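
# Inspect the collected scores; sorting by held-out MAPE is an arbitrary but
# convenient choice (in a notebook, the bare expression would display itself)
print(mape_df.sort_values("NewspapersOnly_predicted"))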