Created
October 4, 2013 01:35
-
-
Save ddgromit/6819739 to your computer and use it in GitHub Desktop.
First classification submission using average miles per year and an RBF support vector classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Define the problem: predict whether a car is a good buy (0) or a bad buy (1).
Type of problem: binary classification.
"""
# Plot values to see which features actually contribute to car values (histograms, scatter plots)
# Drop outliers in the training data
# Clean some columns, e.g. year bought -> number of years owned
### IMPORT MODULES | |
import pandas as pd | |
from sklearn import tree, metrics, svm | |
import numpy as np | |
import matplotlib.pyplot as plt | |
def split_up(X, y, ratio=.7):
    """Randomly partition observations into a training and a test subset.

    A boolean mask with ``int(len(y) * ratio)`` True entries is shuffled
    using NumPy's global random state. Rows where the mask is True form the
    training partition; all remaining rows form the test partition.

    Args:
        X: Array of observations, indexable by a boolean mask of length
            ``len(y)`` (e.g. a NumPy array with one row per observation).
        y: Array of labels aligned with ``X``.
        ratio: Fraction of the observations assigned to the training
            partition.

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test)``.
    """
    n = int(len(y) * ratio)
    # Builtin ``bool`` is used because the ``np.bool`` alias was deprecated
    # in NumPy 1.20 and removed in 1.24.
    ind = np.hstack((np.ones(n, dtype=bool), np.zeros(len(y) - n, dtype=bool)))
    np.random.shuffle(ind)
    held_out = ~ind
    X_train, X_test = X[ind], X[held_out]
    y_train, y_test = y[ind], y[held_out]
    return X_train, y_train, X_test, y_test
### IMPORT DATA
# Labelled training observations and the unlabelled set to predict for the
# submission.
l_train = pd.read_csv('lemon_training.csv')
l_final_test = pd.read_csv('lemon_test.csv')

### CLEAN DATA
# Derive the same engineered column on both frames so the model sees an
# identical feature at fit time and at prediction time.
# NOTE(review): a VehicleAge of 0 would produce inf here, and pandas does not
# treat inf as missing by default -- confirm whether the data contains zeros.
for data_set in (l_train, l_final_test):
    data_set['MilesPerYear'] = data_set['VehOdo'] / data_set['VehicleAge']

# Which features do we want to use?
# Only the engineered mileage feature is used for this submission.
features = ['MilesPerYear']

### TRANSFORM???

### CREATE TEST/TRAINING DATA SETS
# Keep only the features that have no missing values in BOTH frames.
# Dropping NaN columns from each frame independently could leave the training
# matrix and the final test matrix with different columns.
incomplete = (l_train[features].isnull().any()
              | l_final_test[features].isnull().any())
usable_features = [name for name in features if not incomplete[name]]

# Grab just the features we want
X = l_train[usable_features].values
y = l_train.IsBadBuy.values

# Partition into test/training sets b/c we don't have real test data
training_X, training_y, test_X, test_y = split_up(X, y)

# Also create the 'official' test observations for our prediction submission
final_test_X = l_final_test[usable_features].values
### CLASSIFY WITH A MACHINE LEARNING ALGORITHM
# Every print below is a single-argument call so it produces the same output
# under both the Python 2 print statement and the Python 3 print function.
print("Training...")
# Alternative model tried earlier: tree.DecisionTreeClassifier(max_depth=3)
clf = svm.SVC(kernel='rbf', gamma=20).fit(training_X, training_y)

### SCORE IT
print("Predicting against training partition...")
y_train_pred = clf.predict(training_X)
print("Predicting against test partition...")
y_test_pred = clf.predict(test_X)

# Scoring
# Accuracy is computed from the predictions made above instead of calling
# clf.score(), which would run the prediction a second time.
train_accuracy = metrics.accuracy_score(training_y, y_train_pred)
test_accuracy = metrics.accuracy_score(test_y, y_test_pred)
# The training score is measured on the data the model was fit on, so it is
# optimistic; the held-out test score is the more honest estimate.
print("[Train] Score:  %s" % train_accuracy)
print("[Test] Score:  %s" % test_accuracy)

# Confusion matrix to see how closely it fit
print("[Train] Confusion matrix: ")
print(metrics.confusion_matrix(training_y, y_train_pred))
print("[Test] Confusion matrix: ")
print(metrics.confusion_matrix(test_y, y_test_pred))

# Precision and recall
print("[Train] Report")
print(metrics.classification_report(training_y, y_train_pred))
print("[Test] Report")
print(metrics.classification_report(test_y, y_test_pred))
### OUTPUT PREDICTIONS CSV
print("Predicting against final test set...")
test_predict = clf.predict(final_test_X)

# Take an explicit copy of the id column before adding the predictions, so the
# new column is written to an independent frame rather than to a selection of
# l_final_test (which triggers pandas' SettingWithCopyWarning).
df = l_final_test[['RefId']].copy()
df['IsBadBuy'] = test_predict

# One row per RefId with its predicted label, without the index column.
df.to_csv('output.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment