Skip to content

Instantly share code, notes, and snippets.

@shak360
Created August 9, 2017 01:51
Show Gist options
  • Save shak360/ab3aa9396f20774a1b3d93b2d9263bce to your computer and use it in GitHub Desktop.
Save shak360/ab3aa9396f20774a1b3d93b2d9263bce to your computer and use it in GitHub Desktop.
Linear Regression in 2 Variables Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
# Select matplotlib's 'bmh' style sheet.
plt.style.use('bmh')

# Data set: zillow.csv, from http://people.sc.fsu.edu/~jburkardt/data/csv/
df = pd.read_csv('zillow.csv')

# Replace the header that ships with the file with custom column labels.
column_list = [
    'index',
    'Living Space (square footage)',
    'beds',
    'baths',
    'zip',
    'year',
    'Home Price (USD)',
]
df.columns = column_list

# The regression variables are selected by column label only, so switching to
# another pair of columns (or another CSV) means editing just these two names.
xvarstring = 'Living Space (square footage)'
yvarstring = 'Home Price (USD)'

# Copy both columns into plain Python lists for the arithmetic that follows.
x_data = df[xvarstring].tolist()
y_data = df[yvarstring].tolist()
# Least-squares fit of the line y = beta_1 + beta_2 * x.
m = df.shape[0]  # number of rows in the dataframe, i.e. the number of samples

# The slope is undefined unless x takes at least two distinct values: with
# fewer, the denominator of beta_2 below is zero (or pure rounding noise).
# Fail with an explicit message rather than an opaque ZeroDivisionError.
if len(set(x_data)) < 2:
    raise ValueError('linear regression needs at least two distinct x values')

# Column totals and means.
x_totsum = sum(x_data)
y_totsum = sum(y_data)
x_average = x_totsum / m
y_average = y_totsum / m

# Pairwise products x*y and squares x*x, and their totals.
xy_data = [x_val * y_val for x_val, y_val in zip(x_data, y_data)]
xy_totsum = sum(xy_data)
xx_data = [x_val * x_val for x_val in x_data]
xx_totsum = sum(xx_data)
xx_average = xx_totsum / m

# slope:     beta_2 = (sum(xy) - m * mean(x) * mean(y)) / (sum(xx) - m * mean(x)**2)
# intercept: beta_1 = mean(y) - beta_2 * mean(x)
beta_2 = (xy_totsum - m * x_average * y_average) / (xx_totsum - m * x_average ** 2)
beta_1 = y_average - beta_2 * x_average
def linreg(a):
    """Evaluate the fitted regression line at ``a``.

    Returns ``beta_1 + beta_2 * a`` using the module-level coefficients
    ``beta_1`` (intercept) and ``beta_2`` (slope). Only ``*`` and ``+`` are
    applied to ``a``, so it may be a single number or a NumPy array, in which
    case the line is evaluated elementwise.
    """
    return beta_1 + beta_2 * a
# Convert the sample lists to NumPy arrays for plotting; this also lets
# linreg() be evaluated on every sample in one call.
x_nparray = np.asarray(x_data)
y_nparray = np.asarray(y_data)

# Right-hand end of the drawn regression line: half an average x beyond the
# largest observed x. The maximum is used rather than the last row because
# nothing guarantees that the samples are sorted by x.
x_maxestimate = x_nparray.max() + 0.5 * x_average

# Evaluation grid for the line: 0, 1, 2, ... up to (excluding) x_maxestimate.
t1 = np.arange(0, x_maxestimate)
# Figure 1: the observed samples on their own.
plt.figure(1)
plt.xlabel(xvarstring)
plt.ylabel(yvarstring)
plt.title('Linear Regression of {} against {}'.format(yvarstring, xvarstring))
plt.grid(True)
# Proxy artist built from empty data: it only supplies the legend entry.
blue_diamond = mlines.Line2D([], [], color='blue', marker='D', markersize=8, label='Observed Data')
plt.legend(handles=[blue_diamond], loc=4)  # loc=4 places the legend lower right
# Exactly one (x, y, fmt) group is passed: plt.plot treats any further bare
# array as another y-series drawn against its index, which would add a
# spurious line to the scatter.
plt.plot(x_nparray, y_nparray, 'bD')
# Figure 2: observed samples, the fitted value for each sample, and an arrow
# joining every observed point to its fitted point.
plt.figure(2)
plt.xlabel(xvarstring)
plt.ylabel(yvarstring)
plt.title('Linear Regression of {} against {}'.format(yvarstring, xvarstring))
plt.grid(True)
# Proxy artists built from empty data: they only supply the legend entries.
blue_diamond = mlines.Line2D([], [], color='blue', marker='D', markersize=8, label='Observed Data')
black_dot = mlines.Line2D([], [], color='black', marker='o', markersize=8, label='Predicted Data')
plt.legend(handles=[blue_diamond, black_dot], loc=4)  # loc=4 places the legend lower right
# Fitted value for every sample, computed once and reused for dots and arrows.
y_predicted = linreg(x_nparray)
plt.plot(x_nparray, y_nparray, 'bD', x_nparray, y_predicted, 'ko')
for x_obs, y_obs, y_fit in zip(x_nparray, y_nparray, y_predicted):
    # Empty label text: only the arrow is drawn, starting at the observed
    # point (xytext) and ending at the fitted point (xy).
    plt.annotate(
        '',
        xy=(x_obs, y_fit),
        xytext=(x_obs, y_obs),
        arrowprops=dict(facecolor='black', shrink=0),
    )
# Figure 3: observed samples with the fitted line and its two labelled ends.
plt.figure(3)
plt.xlabel(xvarstring)
plt.ylabel(yvarstring)
plt.title('Linear Regression of {} against {}'.format(yvarstring, xvarstring))
plt.grid(True)
# Proxy artists built from empty data: they only supply the legend entries.
blue_diamond = mlines.Line2D([], [], color='blue', marker='D', markersize=8, label='Observed Data')
# Dashed, so the legend entry matches the 'r--' line that is actually plotted.
red_dash = mlines.Line2D([], [], color='red', linestyle='--', label='Regression Line')
plt.legend(handles=[blue_diamond, red_dash], loc=4)  # loc=4 places the legend lower right

# Height of the fitted line at both ends of the drawn range, computed once.
y_at_zero = linreg(0)
y_at_max = linreg(x_maxestimate)

plt.plot(
    x_nparray, y_nparray, 'bD',        # observed samples
    t1, linreg(t1), 'r--',             # fitted line over the grid t1
    0, y_at_zero, 'ko',                # left end of the line
    x_maxestimate, y_at_max, 'ko',     # right end of the line
)

# Label both ends; the xytext offsets move the text away from the marker.
plt.annotate(
    '{} square feet\n ${}'.format(0, y_at_zero),
    xy=(0, y_at_zero),
    xytext=(-250, y_at_zero - 32000),
)
plt.annotate(
    '{} square feet\n ${}'.format(x_maxestimate, y_at_max),
    xy=(x_maxestimate, y_at_max),
    xytext=(x_maxestimate - 250, y_at_max + 15000),
)

# Display every figure created above.
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment