Created
August 9, 2017 01:51
-
-
Save shak360/ab3aa9396f20774a1b3d93b2d9263bce to your computer and use it in GitHub Desktop.
Linear Regression in 2 Variables Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import matplotlib.lines as mlines | |
# Use matplotlib's bundled 'bmh' style sheet for every figure below.
plt.style.use('bmh')

# Load the housing data set (from http://people.sc.fsu.edu/~jburkardt/data/csv/).
df = pd.read_csv('zillow.csv')

# Replace the CSV's own header with friendlier labels. These strings are reused
# as axis labels and in the figure titles, so they are written for humans.
# NOTE: the assignment below expects the CSV to have exactly seven columns.
column_list = [
    'index',
    'Living Space (square footage)',
    'beds',
    'baths',
    'zip',
    'year',
    'Home Price (USD)',
]
df.columns = column_list

# Which column is the predictor (x) and which is the response (y). Kept in
# variables so a different pair of columns, or a different CSV, can be swapped in.
xvarstring = 'Living Space (square footage)'
yvarstring = 'Home Price (USD)'

# Pull both columns out as plain Python lists for the hand-rolled arithmetic.
x_data = df[xvarstring].tolist()
y_data = df[yvarstring].tolist()

# Number of rows in the dataframe, i.e. the number of samples.
m = len(df)
# Closed-form least-squares fit of the line y = beta_1 + beta_2 * x.
#
# Inputs (defined earlier in the script): x_data, y_data (lists of numbers)
# and m (the number of samples).
# Outputs: beta_2 (slope) and beta_1 (intercept), plus the intermediate sums
# and averages, which are kept as module-level names.
if m == 0:
    # Every average below divides by m; fail with a clear message instead of
    # a bare ZeroDivisionError.
    raise ValueError('cannot fit a regression line: the data set has no rows')

# Column totals and means.
x_totsum = sum(x_data)
y_totsum = sum(y_data)
x_average = x_totsum / m
y_average = y_totsum / m

# Sum of pairwise products x_i * y_i (numerator of the slope).
xy_data = [x * y for x, y in zip(x_data, y_data)]
xy_totsum = sum(xy_data)

# Sum of squares x_i ** 2 (denominator of the slope).
xx_data = [x * x for x in x_data]
xx_totsum = sum(xx_data)
xx_average = xx_totsum / m

# slope = (sum(x*y) - m * mean(x) * mean(y)) / (sum(x*x) - m * mean(x)**2)
_slope_denominator = xx_totsum - m * x_average ** 2
if _slope_denominator == 0:
    # Happens when the x values have no spread; the slope is then undefined.
    raise ValueError(
        'cannot fit a regression line: the x values have zero variance'
    )
beta_2 = (xy_totsum - m * x_average * y_average) / _slope_denominator

# The fitted line passes through (mean(x), mean(y)), which fixes the intercept.
beta_1 = y_average - beta_2 * x_average
def linreg(a):
    """Evaluate the fitted regression line at ``a``.

    Computes ``beta_1 + beta_2 * a`` using the module-level intercept
    ``beta_1`` and slope ``beta_2``. The script calls this with both plain
    numbers and numpy arrays; for arrays the arithmetic is element-wise.
    """
    slope_term = beta_2 * a
    return beta_1 + slope_term
# Array copies of the samples for plotting and vectorised evaluation.
x_nparray = np.asarray(x_data)
y_nparray = np.asarray(y_data)

# Right-hand end of the drawn regression line: half an average x beyond the
# LARGEST observed x. Using the last row instead would only work when the CSV
# happens to be sorted by x, and could leave the line short of the data.
x_maxestimate = x_nparray.max() + 0.5 * x_average

# Evenly spaced x positions (step 1) from 0 up to the estimate, on which the
# regression line is evaluated for figure 3.
t1 = np.arange(0, x_maxestimate)
# Figure 1: scatter plot of the observed samples only.
plt.figure(1)
plt.xlabel(xvarstring)
plt.ylabel(yvarstring)
plt.title('Linear Regression of {} against {}'.format(yvarstring, xvarstring))
plt.grid(True)

# Proxy artist so the legend shows a blue diamond for the scatter points.
blue_diamond = mlines.Line2D([], [], color='blue', marker='D', markersize=8, label='Observed Data')
plt.legend(handles=[blue_diamond], loc=4)

# Only one data group is plotted. A trailing bare ``x_nparray`` argument would
# be read by ``plt.plot`` as a second group and drawn against its index.
plt.plot(x_nparray, y_nparray, 'bD')
# Figure 2: observed samples, the model's prediction at each sample's x, and an
# arrow from every observation to its prediction (the residual).
plt.figure(2)
plt.title('Linear Regression of {} against {}'.format(yvarstring, xvarstring))
plt.xlabel(xvarstring)
plt.ylabel(yvarstring)
plt.grid(True)

# Legend proxies: blue diamond for observations, black circle for predictions.
blue_diamond = mlines.Line2D([], [], color='blue', marker='D', markersize=8, label='Observed Data')
black_dot = mlines.Line2D([], [], color='black', marker='o', markersize=8, label='Predicted Data')
plt.legend(handles=[blue_diamond, black_dot], loc=4)

# Evaluate the fitted line once for all samples and reuse the result below.
predicted = linreg(x_nparray)
plt.plot(x_nparray, y_nparray, 'bD', x_nparray, predicted, 'ko')

# One unlabeled arrow per sample: tail at the observed point, head at the
# predicted point directly above or below it.
for x_obs, y_obs, y_hat in zip(x_nparray, y_nparray, predicted):
    plt.annotate(
        '',
        xy=(x_obs, y_hat),
        xytext=(x_obs, y_obs),
        arrowprops={'facecolor': 'black', 'shrink': 0},
    )
# Figure 3: observed samples with the fitted regression line drawn from x = 0
# to x_maxestimate, with both line endpoints marked and labeled.
plt.figure(3)
plt.title('Linear Regression of {} against {}'.format(yvarstring, xvarstring))
plt.xlabel(xvarstring)
plt.ylabel(yvarstring)
plt.grid(True)

# Legend proxies: blue diamond for observations, red line for the fit.
blue_diamond = mlines.Line2D([], [], color='blue', marker='D', markersize=8, label='Observed Data')
red_dash = mlines.Line2D([], [], color='red', label='Regression Line')
plt.legend(handles=[blue_diamond, red_dash], loc=4)

# Model values at the two ends of the drawn line.
y_at_zero = linreg(0)
y_at_max = linreg(x_maxestimate)

# Four data groups: the samples, the dashed line, and the two endpoint markers.
plt.plot(
    x_nparray, y_nparray, 'bD',
    t1, linreg(t1), 'r--',
    0, y_at_zero, 'ko',
    x_maxestimate, y_at_max, 'ko',
)

# Text labels for the endpoints; the xytext offsets are in data coordinates
# and nudge the text away from the marker.
endpoint_label = '{} square feet\n ${}'
plt.annotate(
    endpoint_label.format(0, y_at_zero),
    xy=(0, y_at_zero),
    xytext=(-250, y_at_zero - 32000),
)
plt.annotate(
    endpoint_label.format(x_maxestimate, y_at_max),
    xy=(x_maxestimate, y_at_max),
    xytext=(x_maxestimate - 250, y_at_max + 15000),
)

# Display every figure created above.
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment