Skip to content

Instantly share code, notes, and snippets.

@ddgromit
Created October 4, 2013 01:35
Show Gist options
  • Save ddgromit/6819739 to your computer and use it in GitHub Desktop.
Save ddgromit/6819739 to your computer and use it in GitHub Desktop.
First Classification submission using average miles per year and an RBF Support Vector Classifier
"""
Define the problem: Car is a good buy (0) or bad buy (1)
Type of problem: Classification problem
"""
# Plot values to see what features actually contribute to car values (histograms, scatter plots)
# Drop outliers in training data
# Clean some columns, e.g. year bought -> number of years owned
### IMPORT MODULES
import pandas as pd
from sklearn import tree, metrics, svm
import numpy as np
import matplotlib.pyplot as plt
def split_up(X, y, ratio=.7):
    """Randomly partition observations into a training and a test subset.

    Args:
        X: NumPy array of features, indexed by observation along its first
            axis.
        y: NumPy array of labels, one entry per observation in ``X``.
        ratio: Fraction of observations placed in the training subset; the
            training size is ``int(len(y) * ratio)``.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``.
    """
    n = int(len(y) * ratio)
    # Boolean mask with exactly n True entries; shuffling it picks a random
    # subset of that size. The builtin ``bool`` is used as the dtype because
    # the ``np.bool`` alias was removed in NumPy 1.24.
    ind = np.hstack((np.ones(n, dtype=bool), np.zeros(len(y) - n, dtype=bool)))
    np.random.shuffle(ind)
    X_train, X_test = X[ind], X[~ind]
    y_train, y_test = y[ind], y[~ind]
    return X_train, y_train, X_test, y_test
### IMPORT DATA
l_train = pd.read_csv('lemon_training.csv')
l_final_test = pd.read_csv('lemon_test.csv')

### CLEAN DATA
# Derive the same column on both frames so the model sees an identical
# feature at fit time and at final-prediction time.
for data_set in (l_train, l_final_test):
    # NOTE(review): a VehicleAge of 0 would yield inf/NaN here -- confirm
    # whether either CSV contains such rows.
    data_set['MilesPerYear'] = data_set['VehOdo'] / data_set['VehicleAge']

# Which features do we want to use?
# Only the derived mileage rate is used for this submission. The list of all
# numeric columns (minus RefId and IsBadBuy) that used to be built here was
# overwritten immediately, so that dead computation has been dropped.
features = ['MilesPerYear']
### TRANSFORM???
### CREATE TEST/TRAINING DATA SETS
# Keep only the features that have no missing values in BOTH files. Dropping
# incomplete columns from each frame independently could leave the training
# matrix and the final test matrix with different columns.
usable_features = [
    name for name in features
    if l_train[name].notnull().all() and l_final_test[name].notnull().all()
]
if not usable_features:
    raise ValueError(
        "No requested feature is free of missing values in both data sets")

# Grab just the features we want
X = l_train[usable_features].values
y = l_train.IsBadBuy.values

# Partition into test/training sets b/c we don't have real test data
training_X, training_y, test_X, test_y = split_up(X, y)

# Also create the 'official' test observations for our prediction submission
final_test_X = l_final_test[usable_features].values
### CLASSIFY WITH A MACHINE LEARNING ALGORITHM
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Training...")
# Alternative tried earlier: tree.DecisionTreeClassifier(max_depth=3)
clf = svm.SVC(kernel='rbf', gamma=20).fit(training_X, training_y)
### SCORE IT
print("Predicting against training partition...")
y_train_pred = clf.predict(training_X)
print("Predicting against test partition...")
y_test_pred = clf.predict(test_X)

# Scoring
# Accuracy is computed from the predictions made above rather than through
# clf.score, which would run the classifier over both partitions again.
# Each message is one pre-formatted string so the output is the same under
# Python 2 and Python 3.
# Author's note: the training score is expected to be heavily overfitted.
print("[Train] Score:  %s" % metrics.accuracy_score(training_y, y_train_pred))
# Accuracy on the held-out partition; compare with the training score above.
print("[Test] Score:  %s" % metrics.accuracy_score(test_y, y_test_pred))

# Confusion matrix to see how closely it fit
print("[Train] Confusion matrix: ")
print(metrics.confusion_matrix(training_y, y_train_pred))
print("[Test] Confusion matrix: ")
print(metrics.confusion_matrix(test_y, y_test_pred))

# Precision and recall
print("[Train] Report")
print(metrics.classification_report(training_y, y_train_pred))
print("[Test] Report")
print(metrics.classification_report(test_y, y_test_pred))
### OUTPUT PREDICTIONS CSV
print("Predicting against final test set...")
test_predict = clf.predict(final_test_X)

# Take an explicit copy of the RefId column: adding a column to a selection
# of l_final_test triggers pandas' SettingWithCopyWarning.
df = l_final_test[['RefId']].copy()
df['IsBadBuy'] = test_predict
df.to_csv('output.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment