import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## This assumes a Jupyter notebook (the ! prefix gives the notebook
## command-line capabilities); if not using a notebook, remove the !
## and run pip install on the command line for your OS
!pip install gdelt

import gdelt
gd = gdelt.gdelt(version=1)
# Create a data folder if it doesn't already exist
import os
os.makedirs("data", exist_ok=True)
# Download articles from 8/8/19 to 10/6/19 if not already in the data folder
import datetime
end_date = datetime.datetime(2019, 10, 7)
cur_date = end_date - datetime.timedelta(days=60)
while cur_date < end_date:
    fname = "data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)
    print(fname)
    if not os.path.exists(fname):
        # gdelt's Search wants a zero-padded "YYYY MM DD" date string
        results = gd.Search([cur_date.strftime("%Y %m %d")],
                            table='gkg', coverage=True, translation=False)
        results.to_pickle(fname)
    cur_date += datetime.timedelta(days=1)
# Load each day's articles from the data folder, keeping only the sources
# of interest. This is brute force but keeps things general; if working
# with a longer period, put everything into a database and query it
# instead (e.g. SQLite; see the sketch after this loop).
mySources = ["nytimes.com", "washingtonpost.com", "foxnews.com", "cnn.com"]

df = pd.DataFrame()
for fname in os.listdir("data"):
    print(fname)
    if fname.endswith(".pkl"):
        tmp = pd.read_pickle("data/" + fname)
        tmp = tmp[tmp["SOURCES"].apply(lambda x: x in mySources)]
        df = pd.concat([df, tmp])
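
# A minimal sketch of the database route mentioned above, assuming SQLite via
# pandas' to_sql/read_sql. The table name "gkg" and db path are illustrative
# choices, not part of the original workflow.
import sqlite3

def store_day(day_df, db_path="data/gkg.db"):
    # Append one day's GKG rows to a "gkg" table
    with sqlite3.connect(db_path) as con:
        day_df.to_sql("gkg", con, if_exists="append", index=False)

def load_sources(sources, db_path="data/gkg.db"):
    # Pull back only rows whose SOURCES value is in the given list
    placeholders = ",".join("?" * len(sources))
    with sqlite3.connect(db_path) as con:
        return pd.read_sql("SELECT * FROM gkg WHERE SOURCES IN (%s)" % placeholders,
                           con, params=sources)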
# Format the dataframe and flag the countries of interest in each article
df.DATE = df.DATE.apply(lambda x: str(x))
df.DATE = pd.to_datetime(df.DATE)
df.fillna("", inplace=True)
df.set_index("DATE", drop=True, inplace=True)
# True where the GKG LOCATIONS field mentions the country
df["dprk"] = df["LOCATIONS"].apply(lambda x: x.find("North Korea") > -1)
df["ukraine"] = df["LOCATIONS"].apply(lambda x: x.find("Ukraine") > -1)
df["russia"] = df["LOCATIONS"].apply(lambda x: x.find("Russia") > -1)
df["iran"] = df["LOCATIONS"].apply(lambda x: x.find("Iran") > -1)
df["china"] = df["LOCATIONS"].apply(lambda x: x.find("China") > -1)
# Daily article counts per publisher and topic ("DATE" is the index level)
loc_df = df.groupby(["SOURCES", "DATE"])[["dprk", "ukraine", "russia", "iran", "china"]].sum()
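# loc_df now has a (SOURCES, DATE) MultiIndex with one count column per topic;
# an illustrative lookup for a single publisher's daily counts:
# loc_df.loc["nytimes.com"].head()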
# Build a time_series dataframe indexed by date, with one column per
# publisher + topic combination (e.g. "nytimes.com_dprk")
time_series = pd.DataFrame()
for publisher in mySources:
    # .loc replaces the long-deprecated .ix for label-based indexing
    time_series = pd.concat([time_series, loc_df.loc[publisher].add_prefix("{}_".format(publisher))], axis=1)
# Visualize the series to make sure they loaded properly
time_series.plot(figsize=(20, 10), title="Articles Per Day Mentioning Various Countries by Publisher")
# Dataframe to keep track of scoring
mape_df = pd.DataFrame()

# Fit a SARIMA model on the older data, scoring with mean absolute percentage
# error (MAPE) rather than root mean squared error, which makes performance
# easier to compare across series; see
# https://towardsdatascience.com/modeling-news-coverage-with-python-part-2-starting-news-forecasting-with-limited-data-5c1092de3ea9
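# For reference, the MAPE computed inline below, pulled out as a standalone
# helper (a hypothetical refactor, not part of the original script); days
# where the actual count is 0 are dropped to avoid dividing by zero
def mape(actual, predicted):
    mask = actual != 0
    return np.abs((actual[mask] - predicted[mask]) / actual[mask]).mean() * 100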
from statsmodels.tsa.statespace.sarimax import SARIMAX

fig, axs = plt.subplots(4, 5, sharex=True)
fig.set_size_inches(16, 12)
x = y = 0  # subplot row/column cursors
for issue in time_series:
    if issue.find(".com") < 0:  # skip anything that isn't a publisher_topic column
        continue
    train_l = len(time_series) - 5  # hold out the last 5 days for testing
    # exogenous candidates: the same topic at the other publishers, plus the
    # other topics at this publisher
    selected_series = time_series[[col for col in time_series.columns if col.find(issue[issue.find("_"):]) > -1]]
    pub_series = time_series[[col for col in time_series.columns if col.find(issue[:issue.find("_")]) > -1]].drop(columns=issue)
    selected_series = selected_series.join(pub_series)
    # fit on everything but the held-out days, with one-day-lagged exogenous
    # regressors; [1:] drops the row the shift leaves empty
    s_model = SARIMAX(endog=selected_series[[issue]][:train_l][1:],
                      exog=selected_series[[c for c in selected_series.columns if c != issue]][:train_l].shift().add_suffix("_l1")[1:],
                      order=(3, 1, 1), seasonal_order=(1, 0, 1, 7)).fit()
    f_ru = selected_series[[issue]].copy()[1:]
    f_ru.columns = ["actual"]
    # in-sample predictions plus a 5-step out-of-sample forecast; exog only
    # needs to cover the 5 forecast days
    f_ru["predicted"] = s_model.predict(end=datetime.datetime(2019, 10, 6),
                                        exog=selected_series[[c for c in selected_series.columns if c != issue]].shift()[-5:],
                                        dynamic=False)
    testing = f_ru.copy()
    testing["error"] = np.abs((testing["actual"] - testing["predicted"]) / testing["actual"])
    # MAPE over the whole series (model fit) and over just the held-out days
    # (prediction), skipping days where the actual count is 0
    fit = round(testing[testing["actual"] != 0].error.mean() * 100)
    mape_df.loc[issue, "NewspapersOnly_model"] = fit
    testing2 = testing[-5:]
    fit_p = round(testing2[testing2["actual"] != 0].error.mean() * 100)
    mape_df.loc[issue, "NewspapersOnly_predicted"] = fit_p
    f_ru["actual"].plot(title="{}\nMAPE: test: {}% model: {}%".format(issue, fit_p, fit), ax=axs[x, y])
    f_ru["predicted"][:-5].plot(color="orange", label="predicted: Train", ax=axs[x, y])
    f_ru["predicted"][-6:].plot(color="red", label="predicted: Test", ax=axs[x, y])
    x += 1
    if x > 3:
        x = 0
        y += 1
handles, labels = axs[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='center right')
fig.suptitle("News forecast using within-publication and external topics")
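
# Inspect the collected scores; sorting by held-out MAPE is an arbitrary but
# convenient choice (in a notebook, the bare expression would display itself)
print(mape_df.sort_values("NewspapersOnly_predicted"))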