Skip to content

Instantly share code, notes, and snippets.

@finlytics-hub
Last active July 16, 2020 11:59
Show Gist options
  • Save finlytics-hub/ca6c4a3e5c9b87001027a937717fc61b to your computer and use it in GitHub Desktop.
RFECV practical demonstration with multiple models evaluated within RFECV
# import all the required libraries
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.pipeline import Pipeline
# Load the dataset from CSV: every column except the last is a feature,
# and the final column is the classification target.
data = pd.read_csv('abc.csv')
X, y = data.iloc[:, :-1], data.iloc[:, -1]
# Create a pipeline for each different base algorithm to be used in RFECV
# (no. of features will be auto-selected based on cv in RFECV).
# Each pipeline first runs RFECV feature selection with the given estimator,
# then fits a DecisionTreeClassifier on the selected features.
models = {}
# logistic regression
rfecv = RFECV(estimator = LogisticRegression(), cv = 10, scoring = 'accuracy')
model = DecisionTreeClassifier()
models['LR'] = Pipeline(steps = [('features', rfecv), ('model', model)])
# decision tree
rfecv = RFECV(estimator = DecisionTreeClassifier(), cv = 10, scoring = 'accuracy')
model = DecisionTreeClassifier()
# BUG FIX: the original referenced an undefined name `rfe` here (NameError);
# the RFECV selector is bound to `rfecv`.
models['DT'] = Pipeline(steps = [('features', rfecv), ('model', model)])
# random forest
rfecv = RFECV(estimator = RandomForestClassifier(), cv = 10, scoring = 'accuracy')
model = DecisionTreeClassifier()
# BUG FIX: same undefined-name bug (`rfe` -> `rfecv`) as above.
models['RF'] = Pipeline(steps = [('features', rfecv), ('model', model)])
# XGBoost Classifier
rfecv = RFECV(estimator = XGBClassifier(), cv = 10, scoring = 'accuracy')
model = DecisionTreeClassifier()
models['XGB'] = Pipeline(steps = [('features', rfecv), ('model', model)])
# Evaluate all the models with repeated stratified k-fold cross-validation
# and report the mean accuracy for each.
# BUG FIX: the loop body below was not indented under the `for` statement in
# the original, which is an IndentationError in Python.
results = []
names = []
for name, model in models.items():
    # 10-fold stratified CV, repeated 3 times; fixed seed for reproducibility
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
    scores = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1)
    results.append(scores)
    names.append(name)
    print('>%s: %.3f' % (name, np.mean(scores)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment