# projectyotta/regression_sentdex_part1.py

## regression_sentdex_part1.py
# # make sure you have installed sklearn , pandas and quandl .
# # quandl is like a finance api thing .

# import pandas as pd
# import quandl
# import math
# import numpy as np
# from sklearn import preprocessing, cross_validation, svm
# from sklearn.linear_model import LinearRegression
# from numpy._distributor_init import NUMPY_MKL
# import matplotlib.pyplot as plt
# from matplotlib import style
# import datetime
# import time


# quandl.ApiConfig.api_key='6x-3xWJzAfcwXPpXWZs2'
# df = quandl.get('WIKI/GOOGL')
# # here , you get stuff like open , high , low etc . These are your features .
# # a feature is basically a column , in terms of excel .

# # we don't need all of these , so let's tone them down .

# df = df[['Adj. Open',  'Adj. High',  'Adj. Low',  'Adj. Close', 'Adj. Volume',]]
# df['HL_PCT'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'])*100
# df['PCT_change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'])*100

# # change the df to reflect the columns we need
# df = df[['Adj. Close', 'Adj. Volume', 'HL_PCT', 'PCT_change']]

# # an input you give is a feature , the variable u r trying to predict is ur label

# """
# an example I got from stackoverflow is this : if you are trying to predict what kind of pet
# a person would buy , the features will be salary , gender , age etc
# the label will be animal ( dog , cat , fish  etc )
# """
# # let's do this for what we have

# """
# When inplace=True is passed, the data is modified in place (the call returns nothing).
# When inplace=False is passed (this is the default value, so it isn't necessary), the operation is performed on a copy of the object, and that copy is returned.

# """
# forecast_col = 'Adj. Close'
# df.fillna(value=-99999,inplace=True)

# forecast_out = int(math.ceil(0.01*len(df)))
# df['label'] = df[forecast_col].shift(-forecast_out)
# # dropna removes every row that still contains a NaN ; after the shift above those are the last forecast_out rows , which have no label .
# df.dropna(inplace=True)

# # usually , X ( capital X ) is our features , y ( small y ) is our labels
# # put all inputs & outputs in two np arrays
# X = np.array(df.drop(['label'], 1))

# y = np.array(df['label'])

# # split into training and testing

# X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
# X_lately = X[-forecast_out:]

# # try for linear regression
# clf = LinearRegression()
# clf.fit(X_train,y_train)
# accuracy = clf.score(X_test,y_test)
# print(accuracy)


# # try for regression in svm
# clf = svm.SVR()
# clf.fit(X_train,y_train)
# accuracy = clf.score(X_test,y_test)
# print(accuracy)


# # for k in ['linear','poly','rbf','sigmoid']:
# #     clf = svm.SVR(kernel=k)
# #     clf.fit(X_train, y_train)
# #     confidence = clf.score(X_test, y_test)
# #     print(k,confidence)


# forecast_set = clf.predict(X_lately)
# df['Forecast'] = np.nan

# last_date = df.iloc[-1].name
# last_unix = time.mktime(last_date.timetuple())
# one_day = 86400
# next_unix = last_unix + one_day

# for i in forecast_set:
#     next_date = datetime.datetime.fromtimestamp(next_unix)
#     next_unix += 86400
#     df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)]+[i]

# df['Adj. Close'].plot()
# df['Forecast'].plot()
# plt.legend(loc=4)
# plt.xlabel('Date')
# plt.ylabel('Price')
# plt.show()

# Fit a linear regression on Quandl's WIKI/GOOGL price history to predict the
# adjusted close `forecast_out` rows ahead, then plot the known prices together
# with the forecast for the days that follow the data.
import datetime
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quandl
from matplotlib import style
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression

try:
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # train_test_split lives in sklearn.model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# NOTE(review): an API key is committed in source. QUANDL_API_KEY from the
# environment takes precedence; the literal is kept only as a fallback so
# existing runs behave as before. Consider rotating it and dropping the literal.
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY', '6x-3xWJzAfcwXPpXWZs2')
style.use('ggplot')

df = quandl.get("WIKI/GOOGL")
# .copy() so the column assignments below write to an independent frame.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()
# Daily high-low range and open-to-close change, both as percentages.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()
forecast_col = 'Adj. Close'
# Replace missing values with a sentinel instead of dropping the rows.
df.fillna(value=-99999, inplace=True)
# Forecast horizon: 1% of the number of rows, rounded up.
forecast_out = int(math.ceil(0.01 * len(df)))
# The label of a row is the adjusted close `forecast_out` rows later; the last
# `forecast_out` rows therefore have no label (NaN).
df['label'] = df[forecast_col].shift(-forecast_out)

# `axis` must be passed by keyword; pandas 2.0 no longer accepts it positionally.
X = np.array(df.drop(['label'], axis=1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]  # features of the unlabeled rows, used for the forecast
X = X[:-forecast_out]

# Labels for exactly the rows kept in X. The frame itself is NOT truncated
# (the original called df.dropna here), so the most recent real prices stay in
# the plot and the forecast can be anchored to the true last date.
y = np.array(df['label'])[:-forecast_out]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)  # score on the held-out 20% split

forecast_set = clf.predict(X_lately)
df['Forecast'] = np.nan

# The first prediction belongs to the first day after the data ends, so start
# from the real last index entry. Dates are stepped with timedelta rather than
# 86400-second jumps through local-time timestamps, which drift across DST changes.
# NOTE: rows are stamped on consecutive calendar days, as before; weekends and
# market holidays are not skipped.
last_date = df.index[-1]
one_day = datetime.timedelta(days=1)
empty_features = [np.nan] * (len(df.columns) - 1)

for offset, predicted in enumerate(forecast_set, start=1):
    # Every column is NaN for a forecast row except 'Forecast', the last column.
    df.loc[last_date + offset * one_day] = empty_features + [predicted]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
	# # make sure you have installed sklearn , pandas and quandl .
	# # quandl is like a finance api thing .

	# import pandas as pd
	# import quandl
	# import math
	# import numpy as np
	# from sklearn import preprocessing, cross_validation, svm
	# from sklearn.linear_model import LinearRegression
	# from numpy._distributor_init import NUMPY_MKL
	# import matplotlib.pyplot as plt
	# from matplotlib import style
	# import datetime
	# import time



	# quandl.ApiConfig.api_key='6x-3xWJzAfcwXPpXWZs2'
	# df = quandl.get('WIKI/GOOGL')
	# # here , you get stuff like open , high , low etc . These are your features .
	# # a feature is basically a column , in terms of excel .

	# # we don't need all of these , so let's tone them down .

	# df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume',]]
	# df['HL_PCT'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'])*100
	# df['PCT_change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'])*100

	# # change the df to reflect the columns we need
	# df = df[['Adj. Close', 'Adj. Volume', 'HL_PCT', 'PCT_change']]

	# # an input you give is a feature , the variable u r trying to predict is ur label

	# """
	# an example I got from stackoverflow is this : if you are trying to predict what kind of pet
	# a person would buy , the features will be salary , gender , age etc
	# the label will be animal ( dog , cat , fish etc )
	# """
	# # let's do this for what we have

	# """
	# When inplace=True is passed, the data is modified in place (the call returns nothing).
	# When inplace=False is passed (this is the default value, so it isn't necessary), the operation is performed on a copy of the object, and that copy is returned.

	# """
	# forecast_col = 'Adj. Close'
	# df.fillna(value=-99999,inplace=True)

	# forecast_out = int(math.ceil(0.01*len(df)))
	# df['label'] = df[forecast_col].shift(-forecast_out)
	# # dropna removes every row that still contains a NaN ; after the shift above those are the last forecast_out rows , which have no label .
	# df.dropna(inplace=True)

	# # usually , X ( capital X ) is our features , y ( small y ) is our labels
	# # put all inputs & outputs in two np arrays
	# X = np.array(df.drop(['label'], 1))

	# y = np.array(df['label'])

	# # split into training and testing

	# X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
	# X_lately = X[-forecast_out:]

	# # try for linear regression
	# clf = LinearRegression()
	# clf.fit(X_train,y_train)
	# accuracy = clf.score(X_test,y_test)
	# print(accuracy)



	# # try for regression in svm
	# clf = svm.SVR()
	# clf.fit(X_train,y_train)
	# accuracy = clf.score(X_test,y_test)
	# print(accuracy)





	# # for k in ['linear','poly','rbf','sigmoid']:
	# # clf = svm.SVR(kernel=k)
	# # clf.fit(X_train, y_train)
	# # confidence = clf.score(X_test, y_test)
	# # print(k,confidence)





	# forecast_set = clf.predict(X_lately)
	# df['Forecast'] = np.nan

	# last_date = df.iloc[-1].name
	# last_unix = time.mktime(last_date.timetuple())
	# one_day = 86400
	# next_unix = last_unix + one_day

	# for i in forecast_set:
	# next_date = datetime.datetime.fromtimestamp(next_unix)
	# next_unix += 86400
	# df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)]+[i]

	# df['Adj. Close'].plot()
	# df['Forecast'].plot()
	# plt.legend(loc=4)
	# plt.xlabel('Date')
	# plt.ylabel('Price')
	# plt.show()

	# NOTE(review): this section repeats the live script earlier in the file. It was
# tab-indented with an unindented `for` body, which does not parse at module
# level; indentation is restored here. Running the file executes the pipeline
# once per copy.
#
# Fit a linear regression on Quandl's WIKI/GOOGL price history to predict the
# adjusted close `forecast_out` rows ahead, then plot the known prices together
# with the forecast for the days that follow the data.
import datetime
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quandl
from matplotlib import style
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression

try:
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # train_test_split lives in sklearn.model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# NOTE(review): an API key is committed in source. QUANDL_API_KEY from the
# environment takes precedence; the literal is kept only as a fallback so
# existing runs behave as before. Consider rotating it and dropping the literal.
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY', '6x-3xWJzAfcwXPpXWZs2')
style.use('ggplot')

df = quandl.get("WIKI/GOOGL")
# .copy() so the column assignments below write to an independent frame.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()
# Daily high-low range and open-to-close change, both as percentages.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()
forecast_col = 'Adj. Close'
# Replace missing values with a sentinel instead of dropping the rows.
df.fillna(value=-99999, inplace=True)
# Forecast horizon: 1% of the number of rows, rounded up.
forecast_out = int(math.ceil(0.01 * len(df)))
# The label of a row is the adjusted close `forecast_out` rows later; the last
# `forecast_out` rows therefore have no label (NaN).
df['label'] = df[forecast_col].shift(-forecast_out)

# `axis` must be passed by keyword; pandas 2.0 no longer accepts it positionally.
X = np.array(df.drop(['label'], axis=1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]  # features of the unlabeled rows, used for the forecast
X = X[:-forecast_out]

# Labels for exactly the rows kept in X. The frame itself is NOT truncated
# (the original called df.dropna here), so the most recent real prices stay in
# the plot and the forecast can be anchored to the true last date.
y = np.array(df['label'])[:-forecast_out]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)  # score on the held-out 20% split

forecast_set = clf.predict(X_lately)
df['Forecast'] = np.nan

# The first prediction belongs to the first day after the data ends, so start
# from the real last index entry. Dates are stepped with timedelta rather than
# 86400-second jumps through local-time timestamps, which drift across DST changes.
# NOTE: rows are stamped on consecutive calendar days, as before; weekends and
# market holidays are not skipped.
last_date = df.index[-1]
one_day = datetime.timedelta(days=1)
empty_features = [np.nan] * (len(df.columns) - 1)

for offset, predicted in enumerate(forecast_set, start=1):
    # Every column is NaN for a forecast row except 'Forecast', the last column.
    df.loc[last_date + offset * one_day] = empty_features + [predicted]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()