brockmanmatt / gist:39c6a06834a9a154bd5c8ad0ee2ca5f3
Medium Modeling News 2 - Check GDELT files exist
!pip install gdelt
import gdelt
import os, datetime

# GDELT 1.0 client
gd = gdelt.gdelt(version=1)

# local folder for the daily pulls
os.makedirs("data", exist_ok=True)

# starting 60 days before Oct 7, 2019
cur_date = datetime.datetime(2019, 10, 7) - datetime.timedelta(days=60)
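The preview cuts off before the loop that walks the dates; a minimal sketch of how that check-and-download loop could look, assuming gd.Search accepts a date string and the GKG table is the one being pulled (the original gist may handle tables and errors differently):

end_date = datetime.datetime(2019, 10, 7)
while cur_date <= end_date:
    fname = "data/{}.pkl".format(cur_date.strftime("%Y%m%d"))
    if not os.path.exists(fname):  # only pull days we do not already have
        try:
            results = gd.Search(cur_date.strftime("%Y %m %d"), table="gkg")
            results.to_pickle(fname)
        except Exception as e:
            print("failed on {}: {}".format(cur_date.date(), e))
    cur_date += datetime.timedelta(days=1)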
brockmanmatt / narrowsources
PullSelectNewsArticles
import os
import pandas as pd

# outlets to keep
mySources = ["nytimes.com", "washingtonpost.com", "foxnews.com", "cnn.com"]

df = pd.DataFrame()
k = os.listdir("data")
for i in k:
    print(i)
    if i.endswith(".pkl"):
        tmp = pd.read_pickle("data/" + i)
        # keep only rows from the selected outlets
        tmp = tmp[tmp["SOURCES"].apply(lambda x: x in mySources)]
        df = pd.concat([df, tmp])
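Later fragments work from a time_series DataFrame with one column per outlet/topic pair (for example foxnews.com_russia). A sketch of one way to build it from df, assuming the GKG DATE field is a yyyymmdd value and a topic can be matched as a keyword in the THEMES column; the original post may have matched topics differently:

myCountries = ["dprk", "ukraine", "russia", "iran", "china"]

cols = {}
for source in mySources:
    for topic in myCountries:
        mask = (df["SOURCES"] == source) & df["THEMES"].fillna("").str.lower().str.contains(topic)
        # daily article count for this outlet/topic pair
        cols["{}_{}".format(source, topic)] = df[mask].groupby("DATE").size()
time_series = pd.DataFrame(cols).fillna(0).sort_index()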
brockmanmatt / shift_daily_article_counts.txt
shift_daily_article_counts
import pandas as pd

def get_supervised_df(df, issue, maxlag=1):
    original = df.copy()
    # for each day of lag, add a new column to the dataframe with the previous values
    for i in range(maxlag):
        print(i)
        original = pd.concat([original, df.shift(i + 1).add_suffix("_(t-%s)" % (i + 2))], axis=1).dropna()
    # save original issue to add back in
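    # Assumed completion of the truncated preview: keep the un-lagged issue
    # column as the prediction target, use only the lagged "(t-...)" columns
    # as features, and return the reframed frame. Details here are a guess.
    target = original[issue]
    original = original[[c for c in original.columns if "(t-" in c]]
    original[issue] = target
    return original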
from fastai.tabular import *

def genNN(myDF, issue):
    # build the lagged frame with a 7-day lookback
    reframed = get_supervised_df(myDF, issue, 7)
    days = []
    myOrder = reframed.columns.to_list()
    for i in range(7):

# pull predictions for each split once the learner has been trained
h_predictions, *_ = learn.get_preds(DatasetType.Test)
#h_predictions = h_predictions.numpy()
v_predictions, *_ = learn.get_preds(DatasetType.Valid)
#v_predictions = v_predictions.numpy()
predictions, *_ = learn.get_preds(DatasetType.Train)
#predictions = predictions.numpy()
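This fragment references a learn object that is never defined in the preview; a minimal sketch of how a fastai v1 tabular regression learner could be built on the lagged frame (the validation window, batch size, and layer sizes are assumptions, not the original setup):

# continuous features are all lagged columns; the target is the issue column
cont_names = [c for c in reframed.columns if c != issue]
valid_idx = list(range(len(reframed) - 14, len(reframed) - 7))  # assumed validation window

data = (TabularList.from_df(reframed, cont_names=cont_names, procs=[Normalize])
        .split_by_idx(valid_idx)
        .label_from_df(cols=issue, label_cls=FloatList)
        .databunch(bs=8))
# a test split would be added with .add_test(...) before .databunch()
# to make DatasetType.Test predictions available

learn = tabular_learner(data, layers=[200, 100])
learn.fit_one_cycle(10)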
issue = "foxnews.com_russia"
from sklearn.metrics import mean_squared_error
from math import sqrt
testing = results.copy()
print("RMSE training using mean: {}".format(sqrt(mean_squared_error(testing.actual, testing.m))))
print("RMSE training using model: {}".format(sqrt(mean_squared_error(testing.actual, testing.predicted))))
testing = results[-len(h_predictions)-1:].copy()
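The preview ends here; a short hedged continuation, assuming the same actual, m, and predicted columns are evaluated over the held-out window:

print("RMSE test using mean: {}".format(sqrt(mean_squared_error(testing.actual, testing.m))))
print("RMSE test using model: {}".format(sqrt(mean_squared_error(testing.actual, testing.predicted))))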
from statsmodels.tsa.statespace.sarimax import SARIMAX

# one panel per outlet/topic series (4 rows x 5 columns)
fig, axs = plt.subplots(4, 5, sharex=True)
fig.set_size_inches(16, 12)
x = y = 0
for issue in time_series:
    train_l = 55
    s_model = SARIMAX(endog = time_series[[issue]][:train_l][1:],
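                       # assumed completion of the truncated call: a simple
                       # (1, 1, 1) order; the order in the original may differ
                       order=(1, 1, 1))
    s_results = s_model.fit(disp=False)
    # forecast the remaining days and plot forecast vs. actual
    forecast = s_results.forecast(steps=len(time_series) - train_l)
    axs[x][y].plot(time_series[issue].values, label="actual")
    axs[x][y].plot(range(train_l, len(time_series)), forecast, label="forecast")
    axs[x][y].set_title(issue)
    y += 1
    if y == 5:
        y = 0
        x += 1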
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Installing gdelt from inside a Jupyter notebook: the leading ! runs the
## line as a shell command. If you are not in a notebook, run
## "pip install gdelt" from your OS command line instead.
!pip install gdelt
import gdelt
myCountries = ["dprk", "ukraine", "russia", "iran", "china"]
fig, axs = plt.subplots(1,5, sharey=True, gridspec_kw={'wspace': 0})
fig.set_facecolor("white")
fig.set_size_inches(24,6)
idx = 0
for country in myCountries:
    # grab every outlet column whose name mentions this country
    tmp = time_series[[x for x in time_series.columns if x.find(country) > -1]]
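    # (assumed continuation; the preview stops above) one panel per country
    tmp.plot(ax=axs[idx], legend=False)
    axs[idx].set_title(country)
    idx += 1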
#create a DF to hold errors
search_df = pd.DataFrame()
#set up the grid (1 row x 5 columns)
fig, axs = plt.subplots(1, 5, sharey=True, gridspec_kw={'wspace': 0})
fig.set_size_inches(16,6)
x = y = 0
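The preview ends before the search itself; a hedged sketch of a SARIMAX order search in this style, assuming RMSE per (p, d, q) order is collected into search_df for a single example series (the orders tried and the use of the 1x5 panel grid in the original may differ):

from itertools import product
from math import sqrt
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

train_l = 55
issue = "foxnews.com_russia"
train = time_series[[issue]][:train_l]
test = time_series[issue][train_l:]

rows = []
for p, d, q in product(range(3), range(2), range(3)):
    try:
        fit = SARIMAX(endog=train, order=(p, d, q)).fit(disp=False)
        forecast = fit.forecast(steps=len(test))
        rows.append({"order": (p, d, q), "rmse": sqrt(mean_squared_error(test, forecast))})
    except Exception as e:
        print("order {} failed: {}".format((p, d, q), e))

search_df = pd.DataFrame(rows).sort_values("rmse")
print(search_df.head())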