-
-
Save tengpeng/a596f39bc0bfd3423c6e to your computer and use it in GitHub Desktop.
python benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Amazon Access Challenge Starter Code | |
These files provide some starter code using
the scikit-learn library. They provide some examples of how
to design a simple algorithm, including pre-processing,
training a logistic regression classifier on the data,
assessing its performance through cross-validation, and some
pointers on where to go next.
Paul Duan <email@paulduan.com> | |
""" | |
from __future__ import division | |
import numpy as np | |
from sklearn import (metrics, cross_validation, linear_model, preprocessing) | |
from io_helper import (load_data, save_results) | |
# Base seed for randomized procedures; always seed them so that runs
# are reproducible.
SEED = 42
def load_data(filename, use_labels=True):
    """
    Load data from a CSV file under ``data/`` and return numpy arrays.

    The first row of the file is treated as a header and skipped.
    Column 0 holds the class label and columns 1 to 8 hold the
    features; any later column is ignored.

    Parameters:
        filename: name of the CSV file, relative to the ``data/`` directory.
        use_labels: whether to read the first column (containing class
            labels). If false, the first column is not parsed and the
            returned labels are all 0s.

    Returns:
        A ``(labels, data)`` tuple. ``labels`` is a 1-D array with one
        entry per data row; ``data`` is always a 2-D array with 8
        columns, even when the file contains a single data row.
    """
    first_col = 0 if use_labels else 1
    # Open the file in a ``with`` block so the handle is closed as soon as
    # parsing is done, and parse it in a single pass instead of re-reading
    # the whole file just for the label column.
    # ndmin=2 keeps a one-row file two-dimensional, so that shape[0] below
    # is the number of rows rather than the number of columns.
    with open("data/" + filename) as f:
        table = np.loadtxt(f, delimiter=',',
                           usecols=list(range(first_col, 9)),
                           skiprows=1, ndmin=2)
    if use_labels:
        labels = table[:, 0]
        # Copy the feature columns so callers get a contiguous array, as
        # they did when the features were parsed on their own.
        data = np.ascontiguousarray(table[:, 1:])
    else:
        data = table
        labels = np.zeros(data.shape[0])
    return labels, data
def save_results(predictions, filename):
    """Write a vector of predictions to *filename* in CSV format.

    The file starts with an ``id,ACTION`` header line; each following
    line pairs a 1-based row id with the corresponding prediction.
    """
    with open(filename, 'w') as out:
        out.write("id,ACTION\n")
        # ids start at 1, hence the explicit start value for enumerate
        out.writelines("%d,%f\n" % (row_id, score)
                       for row_id, score in enumerate(predictions, 1))
def main():
    """
    Fit a logistic regression model and write test-set predictions.

    Categorical features are turned into binary features with one-hot
    encoding. The model is scored by AUC over repeated random hold-out
    splits, then retrained on the whole training set to predict the
    test set. The user is prompted for the name of the submission file.
    Labels and features are numpy arrays, as returned by load_data.
    """
    # === load data in memory === #
    print("loading data")
    train_labels, train_features = load_data('train.csv')
    # the test set has no labels, so the placeholder ones are discarded
    _, test_features = load_data('test.csv', use_labels=False)

    # === one-hot encoding === #
    # the encoder is fitted on the training and test sets together so
    # that category IDs encountered in either of them are encoded
    one_hot = preprocessing.OneHotEncoder()
    one_hot.fit(np.vstack((train_features, test_features)))
    # transform returns a sparse matrix
    train_features = one_hot.transform(train_features)
    test_features = one_hot.transform(test_features)

    # new features, if any, have to be computed before the encoding
    # and appended to the dataset after it

    classifier = linear_model.LogisticRegression(C=3)

    # === training & metrics === #
    n_rounds = 10  # repeat the CV procedure to get more precise results
    fold_scores = []
    for round_idx in range(n_rounds):
        # each round randomly holds out 20% of the data as CV set
        split = cross_validation.train_test_split(
            train_features, train_labels,
            test_size=.20, random_state=round_idx*SEED)
        fit_features, held_features, fit_labels, held_labels = split

        # feature selection / hyperparameter optimization would go here

        # train on the kept part, score the held-out part
        classifier.fit(fit_features, fit_labels)
        held_probs = classifier.predict_proba(held_features)[:, 1]

        # AUC metric for this CV fold
        fpr, tpr, _thresholds = metrics.roc_curve(held_labels, held_probs)
        fold_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (round_idx + 1, n_rounds, fold_auc))
        fold_scores.append(fold_auc)

    print("Mean AUC: %f" % (sum(fold_scores)/n_rounds))

    # === Predictions === #
    # for the final predictions, retrain on the whole training set
    classifier.fit(train_features, train_labels)
    test_probs = classifier.predict_proba(test_features)[:, 1]
    submission_name = raw_input("Enter name for submission file: ")
    save_results(test_probs, submission_name + ".csv")
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment