Last active
April 8, 2017 02:40
-
-
Save kevincdurand1/1b69eb55567a36e2436fb42dabcffe74 to your computer and use it in GitHub Desktop.
Feature Selection Using Random Forest #tags: Feature Selection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Feature Selection Using Random Forest.

Trains a random forest on all four iris features, prints each feature's
Gini importance, keeps the features whose importance exceeds 0.15 via
SelectFromModel, retrains on the reduced feature set, and prints the
test accuracy of both the full and the reduced model.
"""
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Human-readable names for the four iris columns, in dataset order.
feat_labels = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']

# Load the iris dataset: X holds the features, y the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the data into 60% training and 40% test.
# random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

# Train a random forest on the full feature set.
# n_jobs=-1 uses all CPU cores; random_state pins the fitted model.
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)

# Print the name and Gini importance of each feature as a (name, value) pair.
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

# Use the trained forest to select features with importance above 0.15.
sfm = SelectFromModel(clf, threshold=0.15)
sfm.fit(X_train, y_train)

# Print the names of the selected (most important) features.
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

# Reduce both the training and test matrices to the selected features.
# Note: the same transform must be applied to train and test data.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

# Retrain a fresh random forest on the reduced feature set only.
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0,
                                       n_jobs=-1)
clf_important.fit(X_important_train, y_train)

# Compare test accuracy: full 4-feature model vs reduced model.
# (Historically ~0.933 vs ~0.883 on this split.)
y_pred = clf.predict(X_test)
print('Full-feature accuracy:', accuracy_score(y_test, y_pred))

y_important_pred = clf_important.predict(X_important_test)
print('Selected-feature accuracy:', accuracy_score(y_test, y_important_pred))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment