Skip to content

Instantly share code, notes, and snippets.

@wermarter
Created January 22, 2017 09:41
Show Gist options
  • Save wermarter/b9c36b5421ea1ba1cec7ff8d99c27e63 to your computer and use it in GitHub Desktop.
Save wermarter/b9c36b5421ea1ba1cec7ff8d99c27e63 to your computer and use it in GitHub Desktop.
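Pickle vs. joblib: caches a Quandl GOOGL DataFrame with pickle and a LinearRegression model with joblib, refreshes either one when it is stale or scores poorly, then forecasts and plots the adjusted close.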
import quandl, pickle, math, datetime
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing, cross_validation
from sklearn.externals import joblib
import matplotlib.pyplot as plt
from matplotlib import style
# Durations expressed in seconds.
one_day = 24 * 60 * 60
one_week = 7 * one_day
# READ DATA
# Reuse the pickled DataFrame while its newest row is less than a week old;
# otherwise (cache missing, unreadable, or stale) download the prices again
# and rewrite the cache.
df = None
try:
    # NOTE: unpickling can execute arbitrary code, so df.pkl must be a file
    # this script wrote itself, never one obtained from an untrusted source.
    with open('df.pkl', 'rb') as cache_file:
        df = pickle.load(cache_file)
    cache_age = datetime.datetime.now().timestamp() - df.iloc[-1].name.timestamp()
    if cache_age > one_week:
        df = None  # stale cache: fall through to a fresh download
except Exception:
    # Best-effort cache: any load failure simply triggers a re-download.
    # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
    df = None
if df is None:
    print('Getting new data from Quandl...')
    # NOTE(review): the API key is hard-coded in source; consider reading it
    # from the environment or a config file instead.
    df = quandl.get('WIKI/GOOGL', api_key='GxHxjaX1EiVzszDc1CQt')
    # High-low spread as a percentage of the adjusted close.
    df['HL_PCT'] = (df['Adj. High']-df['Adj. Low'])/df['Adj. Close']*100.0
    # Open-to-close move as a percentage of the adjusted open.
    df['PCT_Change'] = (df['Adj. Close']-df['Adj. Open'])/df['Adj. Open']*100.0
    # Keep only the feature columns and replace missing values with a sentinel.
    df = df[['Adj. Close', 'HL_PCT', 'PCT_Change', 'Adj. Volume']].fillna(value=-99999)
    with open('df.pkl', 'wb') as cache_file:
        pickle.dump(df, cache_file)
# BACKUP DF
# Keep an independent copy of the full frame before later steps add columns
# to df and drop rows from it. DataFrame.append was removed in pandas 2.0,
# so the copy is taken directly.
df_orig = df.copy()
# SETTING
# Column to predict, and how many rows ahead to predict it (1% of the rows,
# rounded up).
forecast_col = 'Adj. Close'
forecast_out = math.ceil(len(df) * 0.01)

# Scale every column of df, then split the scaled rows: the last
# forecast_out rows are held back for prediction, the rest are features.
# This must happen before the 'label' column is added to df.
X = preprocessing.scale(np.array(df))
X_predict, X = X[-forecast_out:], X[:-forecast_out]

# Each row's label is the forecast column's value forecast_out rows later;
# the trailing rows that have no such value become NaN and are dropped.
df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
Y = np.array(df['label'])
# DO_IT
# NOTE(review): cross_validation and sklearn.externals.joblib are legacy
# scikit-learn modules; newer releases provide train_test_split in
# sklearn.model_selection and joblib as a separate package.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2)

# Reuse the saved model only if it loads and scores at least 0.9 on this
# run's test split; otherwise fit a new one and overwrite the saved file.
clf = None
try:
    clf = joblib.load('clf.pkl')
    confidence = clf.score(X_test, Y_test)
    if confidence < 0.9:
        clf = None  # saved model is no longer good enough
except Exception:
    # Missing or incompatible model file: fall back to re-fitting.
    clf = None
if clf is None:
    print('Re-fitting the line...')
    clf = LinearRegression()
    clf.fit(X_train, Y_train)
    joblib.dump(clf, 'clf.pkl')
    # Always score the freshly fitted model, so confidence never keeps the
    # rejected model's value.
    confidence = clf.score(X_test, Y_test)
forecast_set = clf.predict(X_predict)
# Prepare for VISUAL
# Switch back to the backup frame and give it an empty Forecast column.
df = df_orig
df['Forecast'] = np.nan
# Copy the last known close into Forecast on the final real row
# (presumably so the plotted Forecast line starts where the price line ends).
df.iloc[-1, df.columns.get_loc('Forecast')] = df['Adj. Close'].iloc[-1]

# Append one row per predicted value, one calendar day apart (weekends are
# not skipped). Dates are computed with Timedelta arithmetic on the index
# itself; round-tripping through Unix timestamps and
# datetime.fromtimestamp() shifted them by the machine's UTC offset.
last_date = df.index[-1]
blank_cells = [np.nan] * (len(df.columns) - 1)
for days_ahead, predicted in enumerate(forecast_set, start=1):
    df.loc[last_date + pd.Timedelta(days=days_ahead)] = blank_cells + [predicted]
# PLOT_IT
# Draw the price history and the forecast on the same axes.
style.use('ggplot')
for column in ('Adj. Close', 'Forecast'):
    df[column].plot()
plt.legend(loc=4)  # location code 4 places the legend in the lower right
plt.xlabel('Time')
plt.ylabel('Price')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment