Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save kevincdurand1/1b69eb55567a36e2436fb42dabcffe74 to your computer and use it in GitHub Desktop.
Feature Selection Using Random Forest #tags: Feature Selection
# Feature selection with a random forest: train on all features, keep only the
# features whose Gini importance exceeds a threshold, retrain, and compare
# test-set accuracy of the full-feature vs. reduced-feature models.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

# Load the iris dataset (150 samples, 4 numeric features, 3 classes).
iris = datasets.load_iris()

# Human-readable names for the four features, in dataset column order.
feat_labels = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']

# Feature matrix and target vector.
X = iris.data
y = iris.target

# Split the data into 60% training and 40% test.
# random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

# Train a random forest on all four features.
# n_jobs=-1 uses all CPU cores; random_state fixes the forest for reproducibility.
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)

# Print the name and Gini importance of each feature.
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

# Create a selector that keeps only features whose importance exceeds 0.15,
# using the already-trained forest's feature_importances_.
sfm = SelectFromModel(clf, threshold=0.15)
sfm.fit(X_train, y_train)

# Print the names of the selected (most important) features.
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

# Reduce the data to the selected features only.
# Note: the transform must be applied to both the training X and test X data.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

# Train a second random forest using only the selected features.
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf_important.fit(X_important_train, y_train)

# Evaluate the full-feature (4 features) model on the test data.
y_pred = clf.predict(X_test)
print('Full-feature model accuracy:', accuracy_score(y_test, y_pred))  # ~0.933

# Evaluate the limited-feature (selected features only) model on the test data.
y_important_pred = clf_important.predict(X_important_test)
print('Limited-feature model accuracy:', accuracy_score(y_test, y_important_pred))  # ~0.883
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment