Last active
April 8, 2017 02:40
-
-
Save kevincdurand1/1b69eb55567a36e2436fb42dabcffe74 to your computer and use it in GitHub Desktop.
Feature Selection Using Random Forest #tags: Feature Selection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Feature Selection Using Random Forest.

Trains a random forest on all four iris features, prints each feature's
Gini importance, keeps the features whose importance exceeds 0.15 via
SelectFromModel, retrains on the reduced feature set, and prints the
test accuracy of both the full and the reduced model.
"""
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Human-readable names for the four iris columns, in dataset order.
feat_labels = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']

# Load the iris dataset: X holds the features, y the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the data into 60% training and 40% test.
# random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

# Train a random forest on the full feature set.
# n_jobs=-1 uses all CPU cores; random_state pins the fitted model.
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)

# Print the name and Gini importance of each feature as a (name, value) pair.
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

# Use the trained forest to select features with importance above 0.15.
sfm = SelectFromModel(clf, threshold=0.15)
sfm.fit(X_train, y_train)

# Print the names of the selected (most important) features.
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

# Reduce both the training and test matrices to the selected features.
# Note: the same transform must be applied to train and test data.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

# Retrain a fresh random forest on the reduced feature set only.
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0,
                                       n_jobs=-1)
clf_important.fit(X_important_train, y_train)

# Compare test accuracy: full 4-feature model vs reduced model.
# (Historically ~0.933 vs ~0.883 on this split.)
y_pred = clf.predict(X_test)
print('Full-feature accuracy:', accuracy_score(y_test, y_pred))

y_important_pred = clf_important.predict(X_important_test)
print('Selected-feature accuracy:', accuracy_score(y_test, y_important_pred))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment