This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot per-issue train/test error curves vs. n on a 4x5 subplot grid.
# NOTE(review): truncated by the gist preview — the advance of the subplot
# indices (x, y) after each plot is not visible here.
fig, axs = plt.subplots(4, 5, figsize=(15, 10), sharex=True)
x = y = 0
test_length = 5
# Hoisted out of the loop: score_df is loop-invariant, so build the frame once
# instead of reconstructing it for every issue.
all_scores = pd.DataFrame(score_df)
for issue in time_series:
    # Rows of the score table belonging to this publication/issue.
    scores = all_scores[all_scores.pub == issue]
    scores[["n", "test", "train"]].plot(
        x="n", title="{}".format(issue), ax=axs[x, y], ylim=0
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a lagged 3-day simple moving average for each issue series.
# NOTE(review): truncated by the gist preview — whatever fills score_df
# follows in the elided remainder.
test_length = 5
score_df = []
for issue in time_series:
    sma3 = time_series[[issue]].copy()  # working copy for the lagged SMA-3
    dates = sma3.index
    # BUG FIX: the original loop started at idx = 0, and dates[-0] is dates[0]
    # (Python has no negative-zero index), so the FIRST row was overwritten
    # with the mean of the LAST three rows.  Starting at 1 walks from the
    # newest date backwards; every read (-idx-1 .. -idx-3) is further back
    # than any already-written index, so averages use original observations.
    for idx in range(1, len(time_series) - 2):
        sma3.loc[dates[-idx], issue] = np.round(
            (
                sma3.loc[dates[-idx - 1], issue]
                + sma3.loc[dates[-idx - 2], issue]
                + sma3.loc[dates[-idx - 3], issue]
            )
            / 3
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Per-issue SMA-3 construction for a 4x5 grid of plots.
# NOTE(review): truncated by the gist preview — the plotting of sma3 against
# the raw series and the x/y subplot bookkeeping follow in the elided part.
fig, axs = plt.subplots(4, 5, figsize=(15, 10), sharex=True)
x = y = 0
test_length = 5
for issue in time_series:
    sma3 = time_series[[issue]].copy()  # set up lagged moving average 3 day
    dates = sma3.index
    # BUG FIX: the original inner loop started at idx = 0, and dates[-0] is
    # dates[0], so the first row was clobbered with the mean of the last
    # three rows.  Start at 1: only dates with three prior observations are
    # replaced, and each read is of an index further back than any
    # already-written one, so the average is over original values.
    for idx in range(1, len(time_series) - 2):
        sma3.loc[dates[-idx], issue] = np.round(
            (
                sma3.loc[dates[-idx - 1], issue]
                + sma3.loc[dates[-idx - 2], issue]
                + sma3.loc[dates[-idx - 3], issue]
            )
            / 3
        )
    sma3.rename(columns={issue: "sma3"}, inplace=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): the " | |" tokens below are residue from the web-page scrape,
# and the function is cut off mid-Pipeline by the gist preview; code kept
# byte-identical to the original.
from sklearn.preprocessing import StandardScaler | |
from sklearn.decomposition import PCA | |
from sklearn.linear_model import LinearRegression | |
from sklearn.pipeline import Pipeline | |
# Fit a naive PCA-based model for one issue.  With 0 < n < 1, sklearn's PCA
# keeps enough components to explain that fraction of variance (so n=.9
# retains 90% of variance per the PCA docs).
def get_naive_pca_model(df, issue, test_length, n=.9): | |
print("doing {}".format(issue)) | |
myPCA = PCA(n) | |
# presumably the Pipeline chains StandardScaler -> PCA -> LinearRegression,
# but the remainder is not visible here — TODO confirm against the full gist.
pca = Pipeline([
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Grid of per-issue plots for the best (lowest-AIC) ARIMA candidate.
# NOTE(review): truncated by the gist preview — the plotting call using
# `title` and the subplot-index bookkeeping are not visible here.
fig, axs = plt.subplots(4, 5, figsize=(20, 12), sharex=True)
x = y = 0
test_length = 5
for issue in time_series:
    # Pick the candidate with the smallest AIC; reindex so row 0 is the winner.
    candidates = pd.DataFrame(model_dict[issue])
    myModel = candidates.sort_values(by="aic").head(1).reset_index()
    best = myModel.loc[0]
    title = "{} ({},{},{})\nMAPE: train {} test {}".format(
        issue, best["p"], best["d"], best["q"],
        best["train_mape"], best["test_mape"],
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scatter AIC vs. MAPE for every candidate model, one subplot per issue,
# with test and train overlaid on the same axes.
# NOTE(review): truncated by the gist preview — the use of handles/labels
# (presumably one figure-level legend) and the x/y advance are not visible.
fig, axs = plt.subplots(4, 5, figsize=(20, 12))
x = y = 0
for issue in time_series:
    # Hoisted: build the candidate frame once per issue instead of twice.
    models = pd.DataFrame(model_dict[issue])
    models.plot(x="aic", y="test_mape", style=".", title="{}".format(issue), ax=axs[x, y])
    models.plot(x="aic", y="train_mape", style=".", title="{}".format(issue), ax=axs[x, y])
    if x == 0 and y == 0:
        # Capture legend handles from the first subplot only.
        handles, labels = axs[0, 0].get_legend_handles_labels()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build lagged feature sets per issue: other publications' series for the
# same topic plus the same publication's other series, all lagged one step.
# NOTE(review): truncated by the gist preview — the model fitting that fills
# model_dict[issue] follows in the elided remainder.
model_dict = {}
test_length = 5  # hoisted: loop-invariant, was reassigned on every iteration
for issue in time_series:
    model_dict[issue] = []
    # Column names appear to be "<publication>_<topic>" — TODO confirm.
    # Split point computed once instead of four times per iteration.
    sep = issue.find("_")
    topic_part = issue[sep:]   # "_<topic>" suffix
    pub_part = issue[:sep]     # "<publication>" prefix
    # Same-topic columns across publications, lagged 1 step ("_l1" suffix).
    # (`s in col` is the idiomatic form of `col.find(s) > -1`.)
    selected_series = (
        time_series[[col for col in time_series.columns if topic_part in col]]
        .shift()[1:]
        .drop(columns=issue)
        .add_suffix("_l1")
    )
    # Same-publication columns for other topics, lagged 1 step.
    pub_series = (
        time_series[[col for col in time_series.columns if pub_part in col]]
        .drop(columns=issue)
        .shift()[1:]
        .add_suffix("_l1")
    )
    # Lagged features joined with the unlagged target column.
    selected_series = selected_series.join(pub_series).join(time_series[issue])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fig, axs = plt.subplots(1,3, figsize=(12,8)) | |
issue = time_series.columns[0] | |
test_length = 10 | |
selected_series = time_series[[col for col in time_series.columns if (col.find(issue[issue.find("_"):]) > -1)]].shift()[1:].drop(columns=issue).add_suffix("_l1") | |
pub_series = time_series[[col for col in time_series.columns if (col.find(issue[:issue.find("_")]) > -1)]].drop(columns=issue).shift()[1:].add_suffix("_l1") | |
selected_series = selected_series.join(pub_series).join(time_series[issue]) | |
x = 0 | |
for p in [1,5,10]: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): the " | |" tokens below are scrape residue and the function
# continues past the visible lines (gist preview truncation); code kept
# byte-identical to the original.
from statsmodels.tsa.statespace.sarimax import SARIMAX | |
# Fit SARIMAX(p,d,q)x(1,0,1,7) — seasonal period 7, presumably daily data
# with weekly seasonality (TODO confirm) — on all but the last `test_length`
# rows, using every column other than `issue` as exogenous regressors.
def gen_SARIMA_result(p,d,q, df, issue, test_length): #takes pdq, data, issue to use | |
s_model = SARIMAX(endog = df[issue][:-test_length], | |
exog = df[[x for x in df.columns if x != issue]][:-test_length], | |
order=(p,d,q), seasonal_order=(1,0,1,7)).fit() | |
# Copy of the target series from the 2nd row on; per the author's own
# comment the name was never cleaned up — it holds the results frame.
f_ru = df[[issue]].copy()[1:] #haven't bothered to change this, but it's the results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Preprocess the raw event frame: explode multi-source rows, parse dates,
# blank missing values, and index by date.
mySources = ["cnn.com", "washingtonpost.com", "nytimes.com", "foxnews.com"]
# Unnest entries with multiple ';'-separated sources into one row per source.
# FIX: pandas Index.drop() has no axis argument — the original
# df.columns.drop('SOURCES', 1) passed 1 into the `errors` parameter, which
# expects 'raise'/'ignore'.  'SOURCES' always exists here, so plain drop().
df = (
    df.set_index(df.columns.drop('SOURCES').tolist())
    .SOURCES.str.split(';', expand=True)
    .stack()
    .reset_index()
    .rename(columns={0: 'SOURCES'})
    .loc[:, df.columns]
)
df.DATE = df.DATE.astype(str)  # idiomatic elementwise str() conversion
df.DATE = pd.to_datetime(df.DATE)
df.fillna("", inplace=True)  # blank out missing values before indexing
df.set_index("DATE", drop=True, inplace=True)