-
-
Save Andrew62/1ec169fad18b6df6ef464ec8df246fc7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import time | |
def computeError(intercept, slope, x_data, y_data):
    """Return the mean squared error of the line y = intercept + slope*x
    against the observed targets.

    x_data / y_data are numpy arrays or pandas Series; the arithmetic
    broadcasts element-wise and .sum() reduces over all samples.
    """
    residuals = (intercept + slope * x_data) - y_data
    squared_total = (residuals ** 2).sum()
    return squared_total / len(x_data)
def costFunction(intercept, slope, y_train, x_train, learning_rate):
    """Perform a single gradient-descent update for simple linear regression.

    NOTE(review): despite the name, this returns the *updated parameters*,
    not a cost value.

    Returns the new (intercept, slope) as a tuple.
    """
    n = len(x_train)
    # residuals broadcast element-wise across the Series/array
    residuals = (intercept + slope * x_train) - y_train
    # divide each term by n before summing (matches original float order)
    grad_intercept = (residuals / n).sum()
    grad_slope = ((residuals * x_train) / n).sum()
    new_intercept = intercept - learning_rate * grad_intercept
    new_slope = slope - learning_rate * grad_slope
    return new_intercept, new_slope
def graphResult(value0, value1, x_train, y_train):
    """Scatter-plot the training data and overlay the fitted regression line.

    value0 is the intercept and value1 the slope of the fitted line.
    Blocks on plt.show() until the window is closed.
    """
    plt.title('Linear Regression')
    plt.xlabel('X Training Data')
    plt.ylabel('Y Training Data')
    plt.scatter(x_train, y_train)
    # sample evenly across the observed x range for a smooth line
    xs = np.linspace(x_train.min(), x_train.max())
    plt.plot(xs, value0 + value1 * xs)
    plt.show()
def main():
    """Fit a line to data.csv via gradient descent, plot it, and report stats."""
    start = time.time()
    # header=None: the file has no header row, so without it the first
    # line of data would be consumed as the column names
    d = pd.read_csv('data.csv', header=None)
    d.columns = ["X", "y"]
    x_train = d['X']
    y_train = d['y']
    value0 = 0  # theta0 (intercept), initial guess
    value1 = 0  # theta1 (slope), initial guess
    learning_rate = 0.00001
    iterations = 1000
    print(f"Starting gradient descent linear regression with theta0: {value0} theta1: {value1}")
    print("Working...")
    # run one gradient-descent update per iteration; loop index is unused
    for _ in range(iterations):
        value0, value1 = costFunction(value0, value1, y_train, x_train, learning_rate)
    graphResult(value0, value1, x_train, y_train)
    error = computeError(value0, value1, x_train, y_train)
    end = time.time()
    # fixed typo in user-facing message: "Finsihed" -> "Finished"
    print(f"Finished with theta0: {value0} theta1: {value1} "
          f"iterations: {iterations} time elapsed: {round(end - start, 3)}s "
          f"error: {error}")


# this construct ensures the main function is run when this file is run as
# the primary entry point
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment