Skip to content

Instantly share code, notes, and snippets.

@erogol
Last active February 8, 2018 20:28
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save erogol/7267900 to your computer and use it in GitHub Desktop.
Save erogol/7267900 to your computer and use it in GitHub Desktop.
Logistic-regression ensemble with per-fold feature selection. Requires the scikit-learn Python library.
def linear_model_ensemble(X, y, X_test, fold_num, fold_num_sec, grid_search_range, oobe=True, x_val=True):
    """Train an ensemble of logistic-regression classifiers with per-fold feature selection.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training set.
    y : array-like
        Training labels: +1 for positive instances, -1 for negative instances.
    X_test : array-like
        Test set to produce ensemble predictions for.
    fold_num : int
        Fold count for the first-stage cross-validation that trains one
        classifier (and one feature selector) per fold.
    fold_num_sec : int
        Fold count for the second-stage cross-validation that estimates the
        generalization performance of the whole ensemble.
    grid_search_range : list
        Candidate values for the regularization parameter C in grid search.
    oobe : bool
        If True, weight each member's prediction by its held-out fold score.
    x_val : bool
        If True, run the second-stage cross-validation and print its result.

    Returns
    -------
    pred_all : ndarray
        Averaged (optionally score-weighted) ensemble prediction on X_test.
    clfs : list
        The trained classifiers, one per first-stage fold.

    NOTE(review): relies on a module-level ``find_best_parameters`` helper
    (not visible in this chunk) expected to return ``(fitted_clf, score)``.
    """
    # Imports kept function-local, matching the original gist's style.
    import numpy as np
    from sklearn.model_selection import KFold, StratifiedKFold
    from sklearn.metrics import roc_auc_score
    import sklearn.linear_model as lm
    from sklearn.feature_selection import SelectPercentile, chi2

    # dual=True requires the liblinear solver in current scikit-learn;
    # this matches the solver the original (pre-0.18) sklearn used by default.
    rd = lm.LogisticRegression(dual=True, tol=1e-5,
                               fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None,
                               solver='liblinear')

    scores = np.zeros((0,))  # held-out score of each ensemble member
    clfs = []                # classifier trained on each train fold
    feat_selects = []        # matching feature selector for each fold

    # First-stage CV: each fold trains a separate classifier whose
    # hyper-parameters are optimized and whose discriminative features are
    # selected for that partition of examples.
    for train, test in KFold(n_splits=fold_num).split(X):
        train_fold, test_fold = X[train], X[test]
        train_y, test_y = y[train], y[test]
        # Keep the 16% most discriminative features by chi2 score.
        feat_select = SelectPercentile(score_func=chi2, percentile=16).fit(
            train_fold, train_y.astype(float))
        feat_selects.append(feat_select)
        train_fold = feat_select.transform(train_fold)
        test_fold = feat_select.transform(test_fold)
        tuned_parameters = [{'C': grid_search_range}]
        # Hyper-parameter optimization (module-level helper, defined elsewhere).
        rd_fitted, score = find_best_parameters(train_fold, train_y,
                                                test_fold, test_y,
                                                rd, tuned_parameters)
        clfs.append(rd_fitted)
        scores = np.append(scores, score)

    # Second-stage CV: estimate the generalization performance of the ensemble.
    if x_val:
        clf_scores = np.array(())
        for train, test in StratifiedKFold(n_splits=fold_num_sec).split(X, y):
            test_fold, test_y = X[test], y[test]
            pred_vals = None
            for counter, clf in enumerate(clfs):
                test_fold_transed = feat_selects[counter].transform(test_fold)
                pred_val = clf.predict(test_fold_transed)
                if oobe:
                    # Weight each member's vote by its held-out score.
                    pred_val = pred_val * scores[counter]
                pred_vals = pred_val if pred_vals is None else pred_vals + pred_val
            # Average the member votes and score this fold with ROC AUC.
            pred = pred_vals / len(clfs)
            clf_scores = np.append(clf_scores,
                                   roc_auc_score(test_y.astype(float), pred))
        print("Final X-val result", clf_scores.mean())

    # Final pass: refit every member on the full training data (full feature
    # set — the per-fold selectors are not applied here, as in the original)
    # and accumulate its (optionally weighted) prediction on X_test.
    print("training on full data")
    pred_all = None
    for counter, clf in enumerate(clfs):
        clf.fit(X, y)
        pred = clf.predict(X_test)
        if oobe:
            pred = pred * scores[counter]
        pred_all = pred if pred_all is None else pred_all + pred
    pred_all = pred_all / len(clfs)
    return pred_all, clfs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment