Skip to content

Instantly share code, notes, and snippets.

@jmsword
Created January 31, 2017 15:31
Show Gist options
  • Save jmsword/49a59b13caaeacda12edb71cf70cdf7c to your computer and use it in GitHub Desktop.
Save jmsword/49a59b13caaeacda12edb71cf70cdf7c to your computer and use it in GitHub Desktop.
Over-fitting practice
import numpy as np
import statsmodels.formula.api as smf
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
# Seed NumPy's global random generator so the noise drawn below is the same
# on every run, which makes the printed errors and plots reproducible.
np.random.seed(414)

# Toy data: 1000 evenly spaced points on [0, 15]. The response is a sine wave
# of amplitude 3 plus Gaussian noise centred on the trend 1 + X (std 0.2).
X = np.linspace(0, 15, 1000)
y = 3 * np.sin(X) + np.random.normal(loc=1 + X, scale=.2, size=1000)

# Ordered (not shuffled) split: the first 700 points are used to fit the
# models, the remaining 300 are held out to test them.
train_X, test_X = np.split(X, [700])
train_y, test_y = np.split(y, [700])

# Wrap each subset in a DataFrame so the formula API can refer to the
# columns by name ('X' and 'y').
train_df = pd.DataFrame(dict(X=train_X, y=train_y))
test_df = pd.DataFrame(dict(X=test_X, y=test_y))
# Fit two ordinary-least-squares models on the training frame:
#   poly_1 - degree 1 (intercept + X)
#   poly_2 - degree 2 (intercept + X + X squared)
poly_1, poly_2 = (
    smf.ols(formula=spec, data=train_df).fit()
    for spec in ('y ~ 1 + X', 'y ~ 1 + X + I(X**2)')
)
# Evaluate both fitted models on the data they were trained on.
# In-sample predictions:
y_train_1 = poly_1.predict(train_df)  # linear
y_train_2 = poly_2.predict(train_df)  # quadratic
# Residuals (predicted minus observed):
train_diff_1 = y_train_1 - train_y  # linear
train_diff_2 = y_train_2 - train_y  # quadratic
# Mean squared error. Use the vectorized .mean() instead of the builtin
# sum(), which would iterate over the values one by one in Python.
train_mse_1 = (train_diff_1 ** 2).mean()  # linear
train_mse_2 = (train_diff_2 ** 2).mean()  # quadratic
# Evaluate both fitted models on the held-out testing data.
# Out-of-sample predictions:
y_test_1 = poly_1.predict(test_df)  # linear
y_test_2 = poly_2.predict(test_df)  # quadratic
# Residuals (predicted minus observed):
test_diff_1 = y_test_1 - test_y  # linear
test_diff_2 = y_test_2 - test_y  # quadratic
# Mean squared error. Use the vectorized .mean() instead of the builtin
# sum(), which would iterate over the values one by one in Python.
test_mse_1 = (test_diff_1 ** 2).mean()  # linear
test_mse_2 = (test_diff_2 ** 2).mean()  # quadratic
# Report the training-set errors first, then the testing-set errors, so the
# in-sample and out-of-sample MSE of each model can be compared.
for label, mse in (
    ('Training set Linear MSE: ', train_mse_1),
    ('Training set Quadratic MSE: ', train_mse_2),
    ('Testing set Linear MSE: ', test_mse_1),
    ('Testing set Quadratic MSE: ', test_mse_2),
):
    print(label, mse)
# Draw one figure per model (linear first, then quadratic), showing the
# fitted values against regressor index 1.
# NOTE(review): with these formulas index 0 should be the intercept and
# index 1 the X column - confirm against the statsmodels plot_fit docs.
for fitted in (poly_1, poly_2):
    fig, ax = plt.subplots()
    fig = sm.graphics.plot_fit(fitted, 1, ax=ax)
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment