Skip to content

Instantly share code, notes, and snippets.

# NOTE(review): this snippet lost its indentation and is cut off after the last
# line shown; the statements after the `for` are presumably its body -- confirm
# against the original notebook cell.
# 4x5 grid of subplots sharing one x axis, on a 16x12 inch figure.
fig, axs = plt.subplots(4, 5, sharex=True)
fig.set_size_inches(16,12)
# Both counters start at 0; presumably row/column positions in `axs` (their use is not visible here).
x = y = 0
# `time_series` exposes `.columns` below, so this presumably iterates DataFrame column labels.
for issue in time_series:
# Row count minus 5; presumably the training length with the last 5 rows held out -- TODO confirm.
train_l = len(time_series)-5
# Keep the columns whose name contains the part of `issue` before its first "_".
# NOTE(review): when `issue` has no "_", find() returns -1 and the slice drops the
# last character of `issue` instead of keeping the whole label.
selected_series = time_series[[col for col in time_series.columns if (col.find(issue[:issue.find("_")]) > -1)]]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## The next line assumes a Jupyter notebook and installs gdelt from inside it.
## If you are not using a notebook, remove it and run `pip install gdelt` from your OS command line instead.
!pip install gdelt
## (the leading ! tells the notebook to run the rest of that line as a shell command)
import gdelt
# NOTE(review): indentation was lost in this copy and the loop body is cut off
# after `continue`; confirm the nesting against the original notebook cell.
# 4x5 grid of subplots sharing one x axis, on a 16x12 inch figure.
fig, axs = plt.subplots(4, 5, sharex=True)
fig.set_size_inches(16,12)
# Both counters start at 0; presumably row/column positions in `axs` (their use is not visible here).
x = y = 0
for issue in time_series:
# Skip every label that does not contain ".com"
# (`not` binds looser than `>`, so this reads as `not (find(...) > -1)`).
if not issue.find(".com") > -1:
continue
# NOTE(review): indentation was lost in this copy and the loop body is not
# visible; only the setup can be documented here.
# Starts as an empty DataFrame; presumably filled with MAPE scores inside the loop -- TODO confirm.
mape_df = pd.DataFrame()
# 4x5 grid of subplots sharing one x axis, on a 16x12 inch figure.
fig, axs = plt.subplots(4, 5, sharex=True)
fig.set_size_inches(16,12)
# Both counters start at 0; presumably row/column positions in `axs` (their use is not visible here).
x = y = 0
for issue in time_series:
# NOTE(review): indentation was lost in this copy and the snippet is cut off
# after the last line shown; confirm the nesting against the original notebook cell.
# 4x5 grid of subplots sharing one x axis, on a 16x12 inch figure.
fig, axs = plt.subplots(4, 5, sharex=True)
fig.set_size_inches(16,12)
# Both counters start at 0; presumably row/column positions in `axs` (their use is not visible here).
x = y = 0
for issue in time_series:
# Skip every label that does not contain ".com"
# (`not` binds looser than `>`, so this reads as `not (find(...) > -1)`).
if not issue.find(".com") > -1:
continue
# Row count minus 5; presumably the training length with the last 5 rows held out -- TODO confirm.
train_l = len(time_series)-5
# News-site domains of interest; the list is not used within this snippet.
mySources = ["cnn.com", "washingtonpost.com", "nytimes.com", "foxnews.com"]

# Unnest rows whose SOURCES cell holds several ';'-separated entries so that
# each resulting row carries a single source. Every other column is parked in
# the index while SOURCES is split and stacked, then restored as a column.
column_order = df.columns
key_columns = column_order.drop('SOURCES').tolist()
split_sources = df.set_index(key_columns)['SOURCES'].str.split(';', expand=True)
long_form = split_sources.stack().reset_index().rename(columns={0: 'SOURCES'})
# Selecting the original columns restores their order and discards the helper
# level column that reset_index() created for the stacked position.
df = long_form.loc[:, column_order]

# Convert DATE to timestamps by way of its string form.
df['DATE'] = pd.to_datetime(df['DATE'].apply(str))
# Blank out missing values, then make DATE the index.
df.fillna("", inplace=True)
df.set_index("DATE", drop=True, inplace=True)
from statsmodels.tsa.statespace.sarimax import SARIMAX
# NOTE(review): indentation was lost in this copy and the function is cut off
# after the last line shown (no return statement is visible), so only the
# visible statements are documented.
def gen_SARIMA_result(p,d,q, df, issue, test_length): # takes the (p, d, q) order, the data frame, the target column and the hold-out length
# Fit a SARIMAX model on everything except the last `test_length` rows:
#   endog = the `issue` column, exog = every other column of `df`.
# Seasonal order is fixed at (1, 0, 1, 7), i.e. a seasonal period of 7.
# NOTE(review): test_length == 0 makes `[:-0]` an empty slice, so the model
# would be fitted on no rows -- callers presumably always pass a positive value.
s_model = SARIMAX(endog = df[issue][:-test_length],
exog = df[[x for x in df.columns if x != issue]][:-test_length],
order=(p,d,q), seasonal_order=(1,0,1,7)).fit()
# One-column copy of the target with its first row removed.
f_ru = df[[issue]].copy()[1:] # the variable name is a leftover the author did not rename; this frame is meant to hold the results
# NOTE(review): indentation was lost in this copy and the body of the final
# `for` loop is not visible.
# One row of three subplots on a 12x8 inch figure.
fig, axs = plt.subplots(1,3, figsize=(12,8))
# Work on the first column of time_series, with a hold-out length of 10.
issue = time_series.columns[0]
test_length = 10
# Columns whose name contains the part of `issue` from its first "_" onward,
# shifted down one row (a one-step lag), first row removed, the target column
# itself dropped, and "_l1" appended to each name.
# NOTE(review): if `issue` has no "_", find() returns -1 and the slices here and
# on the next statement use only the last character / all but the last character.
selected_series = time_series[[col for col in time_series.columns if (col.find(issue[issue.find("_"):]) > -1)]].shift()[1:].drop(columns=issue).add_suffix("_l1")
# Same treatment for the columns whose name contains the part of `issue` before its first "_".
pub_series = time_series[[col for col in time_series.columns if (col.find(issue[:issue.find("_")]) > -1)]].drop(columns=issue).shift()[1:].add_suffix("_l1")
# Combine both lagged groups and add the unlagged target column back.
selected_series = selected_series.join(pub_series).join(time_series[issue])
# Counter starts at 0; presumably the subplot position used inside the loop -- its use is not visible here.
x = 0
# Loops over the values 1, 5 and 10; presumably the AR order `p` to try -- TODO confirm.
for p in [1,5,10]:
# NOTE(review): indentation was lost in this copy and the snippet is cut off
# after the last line shown; the statements after the `for` are presumably its
# body -- confirm against the original notebook cell.
# Maps each time_series column label to a list, which starts empty.
model_dict = {}
for issue in time_series:
model_dict[issue]=[]
# Hold-out length of 5.
test_length = 5
# Columns whose name contains the part of `issue` from its first "_" onward,
# shifted down one row (a one-step lag), first row removed, the target column
# itself dropped, and "_l1" appended to each name.
# NOTE(review): if `issue` has no "_", find() returns -1 and the slices here and
# on the next statement use only the last character / all but the last character.
selected_series = time_series[[col for col in time_series.columns if (col.find(issue[issue.find("_"):]) > -1)]].shift()[1:].drop(columns=issue).add_suffix("_l1")
# Same treatment for the columns whose name contains the part of `issue` before its first "_".
pub_series = time_series[[col for col in time_series.columns if (col.find(issue[:issue.find("_")]) > -1)]].drop(columns=issue).shift()[1:].add_suffix("_l1")
# Combine both lagged groups and add the unlagged target column back.
selected_series = selected_series.join(pub_series).join(time_series[issue])
# NOTE(review): indentation was lost in this copy and the snippet is cut off
# after the last line shown; how x and y advance is not visible.
# 4x5 grid of subplots on a 20x12 inch figure.
fig, axs = plt.subplots(4, 5, figsize=(20,12))
# Row/column position of the current subplot in `axs`.
x = y = 0
for issue in time_series:
# Build a frame from the entries stored for this issue and draw test_mape
# against aic as dots, titled with the issue name, on the current subplot.
pd.DataFrame(model_dict[issue]).plot(x="aic", y="test_mape", style=".", title="{}".format(issue), ax=axs[x,y])
# Draw train_mape against aic on the same subplot.
pd.DataFrame(model_dict[issue]).plot(x="aic", y="train_mape", style=".", title="{}".format(issue), ax=axs[x,y])
# On the first subplot only, capture the legend handles and labels;
# presumably reused for a shared figure legend -- not visible here.
if x == 0 and y==0:
handles, labels = axs[0,0].get_legend_handles_labels()