Skip to content

Instantly share code, notes, and snippets.

@alejio
Created July 13, 2016 19:21
Show Gist options
  • Save alejio/5175dea0f3b3c7735f6acd51ffc94aee to your computer and use it in GitHub Desktop.
Save alejio/5175dea0f3b3c7735f6acd51ffc94aee to your computer and use it in GitHub Desktop.
Python, sklearn: Helper function for supervised learning
def supervised_learner(df, clf, train_list, testsize=0.3, predictors=None, target=None):
    """Fit ``clf`` on ``df`` and report train/test classification metrics.

    Rows whose ``pkey`` value is listed in ``train_list`` are always placed in
    the training set.  The remaining rows are randomly split into a train and
    a test part, ``clf`` is fitted on the combined training rows, and a
    classification report, AUC score and ROC curve are emitted for both the
    training and the test set.

    Args:
        df: pandas DataFrame containing a ``pkey`` column, the predictor
            columns and the target column.
        clf: scikit-learn style classifier.  It is fitted in place and must
            provide ``fit``, ``predict``, ``predict_proba`` and, once fitted,
            a ``coef_`` attribute whose first row holds one coefficient per
            predictor.
        train_list: ``pkey`` values whose rows are forced into the training
            set.
        testsize: Base test fraction.  The share of distinct ``pkey`` values
            named in ``train_list`` is added to it before the non-reserved
            rows are split.
        predictors: Column labels used as features.  Defaults to every column
            after the first one, excluding ``target``.
        target: Column label of the response.  Defaults to the last column.

    Returns:
        A ``(clf, df_coeffs)`` tuple: the fitted classifier and a DataFrame
        with the columns ``predictor`` and ``coefficient``, sorted by
        descending coefficient.

    Raises:
        ValueError: If the adjusted test fraction is 1 or larger, i.e. no
            non-reserved rows would be left for training.
    """
    # Imports stay function-local, as in the original helper, so the snippet
    # remains self-contained when pasted into a notebook or module.
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn.metrics import classification_report
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import roc_curve
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # scikit-learn < 0.18
        from sklearn.cross_validation import train_test_split

    # Defaults are resolved here rather than in the signature: a default of
    # ``df.columns[...]`` would be evaluated once, at definition time, against
    # whatever global ``df`` happened to exist (or fail with NameError).
    if target is None:
        target = df.columns[-1]
    if predictors is None:
        # Exclude the target so the label never leaks into the features.
        # NOTE(review): skipping the first column presumably drops ``pkey``;
        # confirm that it is always the first column in callers' frames.
        predictors = [col for col in df.columns[1:] if col != target]

    reserved_mask = df.pkey.isin(train_list)
    reserved_fraction = np.true_divide(len(train_list), len(df.pkey.unique()))

    # NOTE(review): the reserved share is *added* to the requested test size,
    # as in the original code.  Confirm this is the intended compensation
    # (as opposed to testsize / (1 - reserved_fraction)).
    test_fraction = testsize + reserved_fraction
    if test_fraction >= 1:
        raise ValueError(
            'adjusted test size %.2f (testsize + reserved share) must be '
            'below 1' % test_fraction)

    # Only rows that are not reserved for training take part in the split.
    df_free = df[~reserved_mask]
    X_train, X_test, y_train, y_test = train_test_split(
        df_free[predictors], df_free[target], test_size=test_fraction)
    X_train = pd.concat([X_train, df.loc[reserved_mask, predictors]])
    y_train = pd.concat([y_train, df.loc[reserved_mask, target]])

    clf.fit(X_train, y_train)

    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    # Column 1 of predict_proba is used as the score of the positive class.
    y_train_scores = clf.predict_proba(X_train)[:, 1]
    y_test_scores = clf.predict_proba(X_test)[:, 1]

    train_classrep = classification_report(y_train, y_train_pred)
    test_classrep = classification_report(y_test, y_test_pred)
    train_fpr, train_tpr, _ = roc_curve(y_train, y_train_scores)
    test_fpr, test_tpr, _ = roc_curve(y_test, y_test_scores)
    train_auc = roc_auc_score(y_train, y_train_scores)
    test_auc = roc_auc_score(y_test, y_test_scores)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('-------------Training----------------')
    print(train_classrep)
    print('AUC score is %.2f' % train_auc)
    print('ROC Curve')
    plt.plot(train_fpr, train_tpr)
    print('-------------Testing----------------')
    print(test_classrep)
    print('AUC score is %.2f' % test_auc)
    print('ROC Curve')
    plt.plot(test_fpr, test_tpr)

    df_coeffs = pd.DataFrame(
        {'predictor': X_train.columns, 'coefficient': clf.coef_[0]})
    df_coeffs = df_coeffs.sort_values(by=['coefficient'], ascending=False)
    return clf, df_coeffs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment