Created
September 29, 2015 15:49
-
-
Save mwidjaja1/f658c6308e1d24e4819b to your computer and use it in GitHub Desktop.
By using Numpy & Sklearn, I was able to eliminate outliers and produce a more precise fit of net worth vs. age which is plotted by the blue line.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" outlier_packaged.py -------------------------------------------------------- | |
Goal: Importing two pickled data sets (with either the 'X' or 'Y' values), | |
we'll remove 10% of the outliers, which are those with the largest | |
error between the predictions made by our regression model vs. the | |
actual values. | |
Input: No traditional input argument but the user should put the | |
practice_outliers_ages.pkl & practice_outliers_net_worths.pkl (or | |
the pickled data set we're using) in the same folder as this file. | |
Output: 1. Returns the results of our regression line before & after the | |
outlier removal. This is the line's slope, line's y-intercept, | |
and line's regression score where '1' is the most optimal fit. | |
2. A plot of the data where the red line is before outlier removal, | |
blue line is after outlier removal, and blue points are the | |
actual data points. | |
-----------------------------------------------------------------------------""" | |
#!/usr/bin/python
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn import linear_model
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20; prefer
# model_selection and fall back to the legacy module on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
""" Function: getError --------------------------------------------------------- | |
Goal: This function gives us the error of the tuples we're sorting in | |
OutlierCleaner. | |
Input: 1. By being called as a 'key', we automatically give it the row | |
that we are trying to sort. | |
Output: Returns the error. | |
-----------------------------------------------------------------------------""" | |
def getError(item): | |
error = item[2] | |
return error | |
""" Function: OutlierCleaner --------------------------------------------------- | |
Goal: To clean away 10% of the points with the largest residual error | |
(difference between the prediction & actual net worth). We give it | |
all of the data points we have & it gives us 90% of the data points | |
with the smallest error. | |
Input: 1. predictions: All of the predictions | |
2. ages: All the 'x' value in our input data like ages | |
3. net_worths: All the 'y' value in our input like net_worth | |
Output: Returns a tuple named 'cleaned_data'. Each tuple is of the form | |
(age, net_worth, error) | |
-----------------------------------------------------------------------------""" | |
def outlierCleaner(predictions, ages, net_worths): | |
cleaned_data = [] | |
### Obtains Tuple of Age, Net_worth, Error | |
for idx, item in enumerate(net_worths): | |
# Calculates Error | |
error = predictions[idx] - item | |
# Creates Tuple | |
row = (int(ages[idx]), float(net_worths[idx]), np.abs(float(error))) | |
cleaned_data.append(row) | |
# Sorts & keeps best 90% of set | |
cleaned_data = sorted(cleaned_data, key=getError) | |
cleaned_data = cleaned_data[:int(round(len(cleaned_data)*.9))] | |
print cleaned_data | |
return cleaned_data | |
""" Step 1: Import & Setup Data ------------------------------------------------ | |
This step imports the Pickled data and reshapes them into 2D np Arrays. | |
-----------------------------------------------------------------------------""" | |
# Load up some practice data with outliers in it | |
ages = pickle.load( open("practice_outliers_ages.pkl", "r") ) | |
net_worths = pickle.load( open("practice_outliers_net_worths.pkl", "r") ) | |
# We reshape ages and net_worths into 2D numpy arrays | |
# Note: The 2nd argument of reshape command is a tuple of integers: | |
# (n_rows, n_columns) when n_rows = length/number of data points | |
# and n_columns is the number of features. | |
ages = np.reshape( np.array(ages), (len(ages), 1)) | |
net_worths = np.reshape( np.array(net_worths), (len(net_worths), 1)) | |
# We split the ages & net_worth arrays into a 'test' and 'train' dataset. | |
ages_train, ages_test, net_worths_train, net_worths_test = \ | |
train_test_split(ages, net_worths, test_size=0.1, random_state=42) | |
""" Step 2: Perform a Linear Regression ---------------------------------------- | |
This step creates a linear regression & we fit our training data set to it. | |
-----------------------------------------------------------------------------""" | |
# Creates a linear regression | |
reg = linear_model.LinearRegression() | |
# Fits our data to the regression | |
reg.fit(ages_train, net_worths_train) | |
""" Step 3: Reports & Plots data prior to outlier removal -------------------""" | |
# Statistics before removing outliers | |
print 'Before removing Outliers' | |
print ('Slope: %s') % reg.coef_ | |
print ('Y Intercept: %s') % reg.intercept_ | |
print ('Score: %s') % reg.score(ages_test, net_worths_test) | |
# Plots the data before removing outliers as a red line | |
try: | |
plt.plot(ages, reg.predict(ages), color="red") | |
except NameError: | |
pass | |
plt.scatter(ages, net_worths) | |
plt.show() | |
""" Step 4: Removes Outlier Points from Training Dataset ----------------------- | |
We predict what the training net_worths values would be when dervied from | |
the training age values under our regression model. We then compare the | |
values our regression produced vs. the real training net_worths values to | |
determine error and thus, which points are outliers which will be removed. | |
-----------------------------------------------------------------------------""" | |
cleaned_data = [] | |
try: | |
# Predicts what training net_worths would be using our regression model | |
predictions = reg.predict(ages_train) | |
# Compares our predictions with the real training net_worths and sends it | |
# to the function outlierCleaner to remove the outliers. | |
cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train ) | |
except NameError: | |
print "your regression object doesn't exist, or isn't name reg" | |
print "can't make predictions to use in identifying outliers" | |
""" Step 5: Reports & Plots data after outlier removal ----------------------""" | |
# Only reshape ages & net_worths if cleaned_data is returning data | |
# Saves the (training) cleaned_data as ages & net_worths | |
if len(cleaned_data) > 0: | |
ages, net_worths, errors = zip(*cleaned_data) | |
ages = np.reshape( np.array(ages), (len(ages), 1)) | |
net_worths = np.reshape( np.array(net_worths), (len(net_worths), 1)) | |
# Refits our training cleaned_data to our model & plots a regression line | |
try: | |
reg.fit(ages, net_worths) | |
plt.plot(ages, reg.predict(ages), color="blue") | |
except NameError: | |
print "you don't seem to have regression imported/created," | |
print " or else your regression object isn't named reg" | |
print " either way, only draw the scatter plot of the cleaned data" | |
# Plots the training cleaned_data points as a scatterplot | |
plt.scatter(ages, net_worths) | |
plt.xlabel("ages") | |
plt.ylabel("net worths") | |
plt.show() | |
# Statistics after removing outliers | |
print 'After removing Outliers' | |
print ('Slope: %s') % reg.coef_ | |
print ('Y Intercept: %s') % reg.intercept_ | |
print ('Score: %s') % reg.score(ages_test, net_worths_test) | |
else: | |
print "outlierCleaner() is returning an empty list, no refitting to be done" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment