Skip to content

Instantly share code, notes, and snippets.

@dylanjf
Last active December 25, 2015 17:09
Show Gist options
  • Save dylanjf/7011219 to your computer and use it in GitHub Desktop.
Save dylanjf/7011219 to your computer and use it in GitHub Desktop.
3 rep 10 fold CV
########3 rep 10 fold CV to determine feature sparsity percentage via RFE#########
#X = concatenated text features for the training set (title, body, url), transformed via TfidfVectorizer
#y = training set classification (0, 1)
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.cross_validation import KFold
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
#...code here to gen X and y
# Repeated K-fold cross-validation that scores a logistic-regression model
# after discarding the j% of features with the smallest absolute coefficient,
# for j = 0..99.  X and y are expected to be defined before this point
# (see the header comment: X is the vectorised text, y the 0/1 labels).
model = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                              C=1, fit_intercept=True, intercept_scaling=1.0,
                              class_weight=None, random_state=None)

# Cross-validation layout.  The averaging below divides by these same values,
# so changing a count here keeps the reported mean AUC correct.
seed = 42
n_reps = 3
n_folds = 10
n_pcts = 100  # slot j accumulates the AUC obtained with j% of features removed

mean_auc = [0.0] * n_pcts
for i in range(1, n_reps + 1):
    # shuffle=True is what makes random_state matter: without it KFold hands
    # out the same contiguous folds on every repetition, so the repetitions
    # would just recompute identical numbers.
    cv_split = KFold(len(y), n_folds=n_folds, indices=True, shuffle=True,
                     random_state=seed * i)

    # Within each fold:
    # 1) Obtain the train / test split for the fold.
    # 2) Fit the model on all features of the training part.
    # 3) Rank features by abs() of the fitted coefficients (ascending).
    # 4) For each j, refit on the top (100 - j)% of features and add the
    #    test-set AUC to slot j.
    for train_index, test_index in cv_split:
        X_cv_train, X_cv_test = X[train_index], X[test_index]
        Y_cv_train, Y_cv_test = y[train_index], y[test_index]

        model.fit(X_cv_train, Y_cv_train)
        # ravel() takes only an optional memory-order flag; flatten the
        # (1, n_features) coefficient matrix into a 1-D vector.
        coef = model.coef_.ravel()
        important_coef = np.argsort(np.abs(coef))  # weakest first
        n_features = len(important_coef)

        for j in range(n_pcts):
            # Always keep at least one feature.  Slicing from an explicit
            # start index avoids the `[-0:]` trap, which would silently
            # select every feature when the kept count rounds down to zero.
            n_keep = max(1, int(n_features * (1 - j / 100.0)))
            important_coef_subset = important_coef[n_features - n_keep:]

            X_cv_train_subset = X_cv_train[:, important_coef_subset]
            X_cv_test_subset = X_cv_test[:, important_coef_subset]

            model.fit(X_cv_train_subset, Y_cv_train)
            pred = model.predict_proba(X_cv_test_subset)[:, 1]
            mean_auc[j] += metrics.roc_auc_score(Y_cv_test, pred) / float(n_folds)

    # Single-argument call form prints identically on Python 2 and Python 3.
    print("Fold set %d complete." % i)

# Average the per-repetition means over all repetitions.
mean_auc = [total / float(n_reps) for total in mean_auc]

# Index of the best mean AUC == percentage of features to drop.  On ties
# argmax returns the first occurrence, i.e. the smallest percentage.
best_pct = np.argmax(mean_auc)

# NOTE(review): `plot` is not imported in the visible source; presumably this
# runs in an IPython/pylab session that provides it -- confirm.
plot(mean_auc)
# Bare expression, kept from the original (an interactive session echoes it).
best_pct
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment