
# reddragon/linear_regression.py

Created April 5, 2017 08:31
Linear Regression in Python
```python
'''
Linear Regression From First Principles
Author: Gaurav Menghani (gaurav.menghani@gmail.com)
'''
import numpy as np
import matplotlib.pyplot as plt


def linear_sum(X, W, b):
    return X.dot(W) + b


def data_gen(num_rows, num_feats, op_gen_fn=linear_sum):
    # Generate a random design matrix and targets from a random
    # ground-truth linear model.
    data = {}
    W = 0.1 * np.random.randn(num_feats)
    b = 0.1 * np.random.randn()
    data['X'] = np.random.randn(num_rows, num_feats)
    data['y'] = op_gen_fn(data['X'], W, b)
    return data


class LinearRegression(object):
    def __init__(self):
        self.W = None
        self.b = 0

    def init_matrix(self, X):
        # Small random initialization of the parameters.
        self.W = 0.1 * np.random.randn(X.shape[1])
        self.b = 0.1 * np.random.randn()

    def predict(self, X):
        if self.W is None:
            self.init_matrix(X)
        return X.dot(self.W) + self.b

    def diff(self, pred, y):
        return pred - y

    def l1_loss(self, pred, y):
        # Mean absolute error.
        return np.mean(np.abs(self.diff(pred, y)))

    def l2_loss(self, pred, y):
        # Mean squared error.
        return np.mean(self.diff(pred, y) ** 2)

    def loss(self, pred, y):
        return self.l2_loss(pred, y)

    def train(self, data, iters, lr):
        X = data['X']
        y = data['y']
        orig_pred = self.predict(X)
        orig_l1 = self.l1_loss(orig_pred, y)
        orig_l2 = self.l2_loss(orig_pred, y)
        print_every = 50
        l1, l2 = orig_l1, orig_l2
        l1_losses = [l1]
        l2_losses = [l2]
        for it in range(iters):
            pred = self.predict(X)
            s1 = pred - y
            # Batch gradients of the squared-error loss (the constant
            # factor of 2 is folded into the learning rate).
            wGrad = X.T.dot(s1) / X.shape[0]
            bGrad = np.sum(s1) / X.shape[0]
            self.W -= lr * wGrad
            self.b -= lr * bGrad
            l1 = self.l1_loss(pred, y)
            l2 = self.l2_loss(pred, y)
            l1_losses.append(l1)
            l2_losses.append(l2)
            if it % print_every == 0:
                print('Iteration: %d' % it)
                print('L1 loss: %f, L2 loss: %f' % (l1, l2))
                print('---')
        print('\n====')
        print('Original L1 loss: %f, L2 loss: %f' % (orig_l1, orig_l2))
        print('Final L1 loss: %f, L2 loss: %f' % (l1, l2))
        print('Estimated Params:')
        print('- b:', self.b)
        print('- W:', self.W)
        print('====\n')

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set_title('Linear Regression Over %d Variables' % X.shape[1])
        ax.set_xlabel('Iterations')
        ax.set_ylabel('L2 Loss')
        ax.set_ylim(bottom=0)
        ax.plot(l2_losses, label='lr')
        plt.show()


np.random.seed(2)
data = data_gen(100, 5)

# Persist the training data so the fit can be validated externally.
with open('lin_reg_data', 'w') as data_file:
    for xd, yd in zip(data['X'], data['y']):
        data_file.write(str(yd) + ',' + ','.join(str(x) for x in xd) + '\n')

regression = LinearRegression()
predictions = regression.predict(data['X'])
loss = regression.loss(predictions, data['y'])
regression.train(data, iters=1000, lr=0.01)
```
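
For reference, `train()` performs batch gradient descent on the mean squared error; the gradients it implements follow directly from differentiating the loss:

```math
L(W,b) = \frac{1}{n}\sum_{i=1}^{n}\left(x_i^\top W + b - y_i\right)^2,
\qquad
\frac{\partial L}{\partial W} = \frac{2}{n}\, X^\top (XW + b - y),
\qquad
\frac{\partial L}{\partial b} = \frac{2}{n}\sum_{i=1}^{n}\left(x_i^\top W + b - y_i\right)
```

The code drops the constant factor of 2, which is equivalent to rescaling the learning rate `lr`.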

### reddragon commented Apr 5, 2017

Validating the solution in R:

```r
> df <- read.delim(file="lin_reg_data", header=F, sep=",")
> lm(df[,1] ~ df[,2]+df[,3]+df[,4]+df[,5]+df[,6])

Call:
lm(formula = df[, 1] ~ df[, 2] + df[, 3] + df[, 4] + df[, 5] +
df[, 6])

Coefficients:
(Intercept)      df[, 2]      df[, 3]      df[, 4]      df[, 5]      df[, 6]
-0.084175    -0.041676    -0.005627    -0.213620     0.164027    -0.179344
```
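
The same cross-check can be done in Python against the persisted file. Here is a minimal sketch (not part of the original gist) using `numpy.linalg.lstsq` to solve the least-squares problem in closed form; both its coefficients and the gradient-descent estimates from `train()` should approach the values `lm` reports:

```python
import numpy as np

# First column of the persisted file is y; the remaining five are X.
data = np.loadtxt('lin_reg_data', delimiter=',')
y, X = data[:, 0], data[:, 1:]

# Append a column of ones so the intercept is fit jointly,
# mirroring R's lm(df[,1] ~ df[,2] + ... + df[,6]).
A = np.hstack([X, np.ones((X.shape[0], 1))])
coef, _, _, _ = np.linalg.lstsq(A, y, rcond=None)
print('W:', coef[:-1])  # compare with the R slope coefficients
print('b:', coef[-1])   # compare with the R intercept
```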