Created
November 4, 2017 18:43
-
-
Save projectyotta/a099cdb63c5a951eb6e81e32af2f88e9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# # Make sure you have installed sklearn, pandas and quandl.
# # quandl is a client library for a financial-data API.
# import pandas as pd | |
# import quandl | |
# import math | |
# import numpy as np | |
# from sklearn import preprocessing, cross_validation, svm | |
# from sklearn.linear_model import LinearRegression | |
# from numpy._distributor_init import NUMPY_MKL | |
# import matplotlib.pyplot as plt | |
# from matplotlib import style | |
# import datetime | |
# import time | |
# quandl.ApiConfig.api_key='6x-3xWJzAfcwXPpXWZs2' | |
# df = quandl.get('WIKI/GOOGL') | |
# # here , you get stuff like open , high , low etc . These are your features . | |
# # a feature is basically a column , in terms of excel . | |
# # we don't need all of these , so let's tone them down . | |
# df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume',]] | |
# df['HL_PCT'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'])*100 | |
# df['PCT_change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'])*100 | |
# # change the df to reflect the columns we need | |
# df = df[['Adj. Close', 'Adj. Volume', 'HL_PCT', 'PCT_change']] | |
# # An input you give the model is a feature; the variable you are trying to predict is the label.
# """ | |
# an example I got from stackoverflow is this : if you are trying to predict what kind of pet | |
# a person would buy , the features will be salary , gender , age etc | |
# the label will be animal ( dog , cat , fish etc ) | |
# """ | |
# # let's do this for what we have | |
# """ | |
# When inplace=True is passed, the data is modified in place (the call returns nothing).
# When inplace=False is passed (the default, so it isn't necessary), the operation is performed and a copy of the object is returned.
# """ | |
# forecast_col = 'Adj. Close' | |
# df.fillna(value=-99999,inplace=True) | |
# forecast_out = int(math.ceil(0.01*len(df))) | |
# df['label'] = df[forecast_col].shift(-forecast_out) | |
# # df.dropna removes the trailing rows whose 'label' is NaN after the shift above (no future price exists for them yet).
# df.dropna(inplace=True) | |
# # usually , X ( capital X ) is our features , y ( small y ) is our labels | |
# # put all inputs & outputs in two np arrays | |
# X = np.array(df.drop(['label'], 1)) | |
# y = np.array(df['label']) | |
# # split into training and testing | |
# X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2) | |
# X_lately = X[-forecast_out:] | |
# # try for linear regression | |
# clf = LinearRegression() | |
# clf.fit(X_train,y_train) | |
# accuracy = clf.score(X_test,y_test) | |
# print(accuracy) | |
# # try for regression in svm | |
# clf = svm.SVR() | |
# clf.fit(X_train,y_train) | |
# accuracy = clf.score(X_test,y_test) | |
# print(accuracy) | |
# # for k in ['linear','poly','rbf','sigmoid']: | |
# # clf = svm.SVR(kernel=k) | |
# # clf.fit(X_train, y_train) | |
# # confidence = clf.score(X_test, y_test) | |
# # print(k,confidence) | |
# forecast_set = clf.predict(X_lately) | |
# df['Forecast'] = np.nan | |
# last_date = df.iloc[-1].name | |
# last_unix = time.mktime(last_date.timetuple()) | |
# one_day = 86400 | |
# next_unix = last_unix + one_day | |
# for i in forecast_set: | |
# next_date = datetime.datetime.fromtimestamp(next_unix) | |
# next_unix += 86400 | |
# df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)]+[i] | |
# df['Adj. Close'].plot() | |
# df['Forecast'].plot() | |
# plt.legend(loc=4) | |
# plt.xlabel('Date') | |
# plt.ylabel('Price') | |
# plt.show() | |
# Forecast the GOOGL adjusted close with a linear regression.
#
# Pipeline: download daily prices from Quandl, derive two percentage features,
# label every row with the adjusted close `forecast_out` rows in the future,
# fit a LinearRegression on the labelled rows, predict for the most recent
# (unlabelled) rows, and plot the history together with the forecast.
import datetime
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quandl
from matplotlib import style
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression
# `sklearn.cross_validation` was removed from scikit-learn; its
# train_test_split lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# NOTE(review): an API key is committed in source. Prefer supplying it through
# the QUANDL_API_KEY environment variable; the literal is kept only as a
# backward-compatible fallback and should be rotated.
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY', '6x-3xWJzAfcwXPpXWZs2')
style.use('ggplot')

df = quandl.get("WIKI/GOOGL")

# Keep only the adjusted price/volume columns. `.copy()` makes the frame
# independent so the column assignments below do not write into a slice.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()
# Daily high-low range and open-to-close move, both as percentages.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()

forecast_col = 'Adj. Close'
# Replace missing feature values with a large sentinel instead of dropping rows.
df = df.fillna(value=-99999)

# Predict 1% of the data set's length (in rows) into the future.
forecast_out = int(math.ceil(0.01 * len(df)))
# Label of each row = adjusted close `forecast_out` rows later; the last
# `forecast_out` rows therefore get NaN labels.
df['label'] = df[forecast_col].shift(-forecast_out)

# Scale all feature rows together, then split off the most recent rows
# (which have no label yet) as the inputs to forecast from.
X = np.array(df.drop(columns=['label']))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Train only on rows that have a label. `df` itself keeps every observed row
# so that the plot shows the full price history and the forecast starts after
# the real last observation (dropping rows from `df` in place would move the
# "last date" back by `forecast_out` rows).
labelled = df.dropna()
y = np.array(labelled['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
# Score of the fitted model on the held-out 20% split.
confidence = clf.score(X_test, y_test)
print(confidence)

forecast_set = clf.predict(X_lately)

df['Forecast'] = np.nan
# Append one row per prediction, one calendar day apart, starting the day
# after the last observed date. timedelta arithmetic avoids the local-time /
# DST drift of a mktime()/fromtimestamp() round-trip.
# NOTE(review): the source data appears to be trading days while these are
# consecutive calendar days (weekends included) -- confirm this is intended.
one_day = datetime.timedelta(days=1)
next_date = df.index[-1] + one_day
for prediction in forecast_set:
    # Every column except the trailing 'Forecast' is left empty on these rows.
    df.loc[next_date] = [np.nan] * (len(df.columns) - 1) + [prediction]
    next_date += one_day

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment