Skip to content

Instantly share code, notes, and snippets.

@projectyotta
Created November 4, 2017 18:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save projectyotta/a099cdb63c5a951eb6e81e32af2f88e9 to your computer and use it in GitHub Desktop.
Save projectyotta/a099cdb63c5a951eb6e81e32af2f88e9 to your computer and use it in GitHub Desktop.
# # Make sure you have installed sklearn, pandas and quandl.
# # quandl is a client for a financial-data API.
# import pandas as pd
# import quandl
# import math
# import numpy as np
# from sklearn import preprocessing, cross_validation, svm
# from sklearn.linear_model import LinearRegression
# from numpy._distributor_init import NUMPY_MKL
# import matplotlib.pyplot as plt
# from matplotlib import style
# import datetime
# import time
# quandl.ApiConfig.api_key='6x-3xWJzAfcwXPpXWZs2'
# df = quandl.get('WIKI/GOOGL')
# # here , you get stuff like open , high , low etc . These are your features .
# # a feature is basically a column , in terms of excel .
# # we don't need all of these , so let's tone them down .
# df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume',]]
# df['HL_PCT'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'])*100
# df['PCT_change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'])*100
# # change the df to reflect the columns we need
# df = df[['Adj. Close', 'Adj. Volume', 'HL_PCT', 'PCT_change']]
# # An input you give is a feature; the variable you are trying to predict is your label.
# """
# an example I got from stackoverflow is this : if you are trying to predict what kind of pet
# a person would buy , the features will be salary , gender , age etc
# the label will be animal ( dog , cat , fish etc )
# """
# # let's do this for what we have
# """
# When inplace=True is passed, the data is modified in place (the call returns nothing).
# When inplace=False is passed (this is the default value, so it isn't necessary), the operation is performed and a copy of the object is returned.
# """
# forecast_col = 'Adj. Close'
# df.fillna(value=-99999,inplace=True)
# forecast_out = int(math.ceil(0.01*len(df)))
# df['label'] = df[forecast_col].shift(-forecast_out)
# # dropna removes the trailing rows whose 'label' is NaN after the shift above (they have no future price to learn from).
# df.dropna(inplace=True)
# # usually , X ( capital X ) is our features , y ( small y ) is our labels
# # put all inputs & outputs in two np arrays
# X = np.array(df.drop(['label'], 1))
# y = np.array(df['label'])
# # split into training and testing
# X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
# X_lately = X[-forecast_out:]
# # try for linear regression
# clf = LinearRegression()
# clf.fit(X_train,y_train)
# accuracy = clf.score(X_test,y_test)
# print(accuracy)
# # try for regression in svm
# clf = svm.SVR()
# clf.fit(X_train,y_train)
# accuracy = clf.score(X_test,y_test)
# print(accuracy)
# # for k in ['linear','poly','rbf','sigmoid']:
# # clf = svm.SVR(kernel=k)
# # clf.fit(X_train, y_train)
# # confidence = clf.score(X_test, y_test)
# # print(k,confidence)
# forecast_set = clf.predict(X_lately)
# df['Forecast'] = np.nan
# last_date = df.iloc[-1].name
# last_unix = time.mktime(last_date.timetuple())
# one_day = 86400
# next_unix = last_unix + one_day
# for i in forecast_set:
# next_date = datetime.datetime.fromtimestamp(next_unix)
# next_unix += 86400
# df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)]+[i]
# df['Adj. Close'].plot()
# df['Forecast'].plot()
# plt.legend(loc=4)
# plt.xlabel('Date')
# plt.ylabel('Price')
# plt.show()
import quandl, math
import numpy as np
import pandas as pd
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from matplotlib import style
import datetime
import time
# Train a linear-regression model on WIKI/GOOGL daily prices from Quandl,
# forecast the adjusted close about 1% of the dataset length into the future,
# and plot the price history together with the forecast.

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old cross_validation module was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# NOTE(review): hard-coded API credential; consider loading it from the
# environment instead of keeping it in source control.
quandl.ApiConfig.api_key = '6x-3xWJzAfcwXPpXWZs2'
style.use('ggplot')

df = quandl.get("WIKI/GOOGL")
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]

# Derived features: daily high-low spread and open-to-close move, in percent.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

forecast_col = 'Adj. Close'
# Replace missing values with an outlier sentinel rather than dropping rows.
df.fillna(value=-99999, inplace=True)

# Number of rows to look ahead: 1% of the dataset, rounded up.
forecast_out = int(math.ceil(0.01 * len(df)))
# The label of each row is the forecast column `forecast_out` rows later, so
# the last `forecast_out` rows end up with a NaN label.
df['label'] = df[forecast_col].shift(-forecast_out)

# Pass axis by keyword: the positional form is rejected by pandas >= 2.0.
X = np.array(df.drop(['label'], axis=1))
X = preprocessing.scale(X)
# The most recent rows have no label yet; they are the rows we predict from.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Keep only the labels that exist, but leave `df` itself intact. Dropping the
# unlabeled rows from `df` would remove the latest actual prices from the plot
# and anchor the forecast `forecast_out` rows too early.
y = np.array(df['label'])[:-forecast_out]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
# Score on the held-out 20% (R^2 for LinearRegression); computed, not displayed.
confidence = clf.score(X_test, y_test)

forecast_set = clf.predict(X_lately)
df['Forecast'] = np.nan

# Append one row per prediction, starting the day after the last actual row.
# timedelta arithmetic avoids the local-time/DST drift of a mktime round-trip.
# NOTE(review): dates advance by calendar days, so weekends and holidays also
# receive a forecast row, as in the original script.
last_date = df.index[-1]
for offset, prediction in enumerate(forecast_set, start=1):
    next_date = last_date + datetime.timedelta(days=offset)
    # Every column except the trailing 'Forecast' column stays empty.
    df.loc[next_date] = [np.nan] * (len(df.columns) - 1) + [prediction]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment