# example from http://www.johnwittenauer.net/machine-learning-exercises-in-python-part-1/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, sys
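
# Usage (added note): python <this_script>.py [path/to/data.csv]
# The CSV is expected to have a header row with 'Population' and 'Profit'
# columns, since the plotting code below accesses data.Population and
# data.Profit; when no argument is given it defaults to 'example.csv' in
# the current working directory.
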
def computeCost(X, y, theta):
    inner = np.power(((X * theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))
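
# computeCost implements the squared-error cost
#     J(theta) = (1 / 2m) * sum((X * theta^T - y)^2)
# where X is an m x n design matrix, theta a 1 x n row matrix of parameters
# and y an m x 1 column of targets. Optional sanity check with made-up
# numbers (not part of the original gist): with theta = [0, 1] the
# predictions reproduce y exactly, so the cost must be 0.
# _X = np.matrix([[1.0, 1.0], [1.0, 2.0]])
# _y = np.matrix([[1.0], [2.0]])
# assert computeCost(_X, _y, np.matrix([[0.0, 1.0]])) == 0.0
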
def gradientDescent(X, y, theta, alpha, iters):
    temp = np.matrix(np.zeros(theta.shape))
    parameters = int(theta.ravel().shape[1])
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * theta.T) - y
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = theta[0, j] - ((alpha / len(X)) * np.sum(term))
        theta = temp
        cost[i] = computeCost(X, y, theta)
    return theta, cost
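
# gradientDescent performs batch gradient descent: each iteration computes
# the prediction error X * theta^T - y once, then updates every parameter
#     theta_j := theta_j - (alpha / m) * sum(error * X[:, j])
# writing into `temp` so that all parameters are updated from the same
# error vector. The per-iteration cost is recorded in `cost`, which can be
# inspected afterwards to check that the algorithm is converging.
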
def scale(to_scale):
    minimum = to_scale.min()
    maximum = to_scale.max()
    return (to_scale - minimum) / (maximum - minimum)

def unscale(to_scale, original):
    minimum = original.min()
    maximum = original.max()
    return (to_scale * (maximum - minimum)) + minimum
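
# scale() applies min-max normalization, mapping a matrix onto [0, 1]:
#     scaled = (x - min) / (max - min)
# unscale() is its inverse, but uses the min/max of a reference matrix
# (`original`); e.g. unscale(0.5, y_original) returns the midpoint of the
# original profit range. Both features and targets are scaled below, so any
# prediction made with the fitted theta lives in the scaled space and must
# be passed through unscale() to get back to real units.
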
# set path
path = os.getcwd() + '/'
if len(sys.argv) > 1:
    path += sys.argv[1]
else:
    path += 'example.csv'
# get data
data = pd.read_csv(path)
data.insert(0, 'Ones', 1)
# set X (training data) and y (target variable)
cols = data.shape[1]
X = data.iloc[:,cols-2] # population
y = data.iloc[:,cols-1:cols] # profit
X = np.matrix(X.values)
y = np.matrix(y.values)
X_original = X
y_original = y
X = scale(X)
y = scale(y)
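
# At this point X is a 1 x m row matrix of scaled populations. The next
# three lines append a row of ones below it (the bias/intercept feature),
# giving a 2 x m matrix that is transposed to m x 2 further down, so column 0
# holds the population feature and column 1 the constant 1. The 'Ones'
# column inserted into the DataFrame above is never used as a feature; it
# only shifts the column indices when slicing X and y.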
ones = np.ones(shape = (X.shape[0] + 1, X.shape[1]))
ones[:-1,:] = X
X = ones
# variables for gradient descent
alpha = 0.01
iters = 1000
theta = np.matrix(np.array([0,0]))
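# alpha is the learning rate and iters the number of gradient descent
# steps; theta starts as a 1 x 2 row matrix of zeros (slope on the scaled
# population, then the intercept, matching the column order built above).
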
# compute gradient descent
X = X.T
theta_result, cost = gradientDescent(X, y, theta, alpha, iters)
# theta_result = unscale(theta_result, y_original)
print "Theta: ", theta_result
print "Initial cost: ", computeCost(X, y, theta)
print "End cost: ", computeCost(X, y, theta_result)
# plotting data: the model was fitted on min-max scaled values with columns
# ordered [population, bias], so the population grid is scaled before
# applying theta and the prediction is mapped back to the original profit range
x = np.linspace(data.Population.min(), data.Population.max(), 100)
x_scaled = (x - X_original.min()) / (X_original.max() - X_original.min())
f = unscale((theta_result[0, 0] * x_scaled) + theta_result[0, 1], y_original)
fig, ax = plt.subplots(figsize = (12, 8))
ax.plot(x, f, 'r', label = 'Prediction')
ax.scatter(data.Population, data.Profit, label = 'Training Data')
ax.legend(loc = 2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()