Skip to content

Instantly share code, notes, and snippets.

@puraminy
Last active December 23, 2019 07:04
Show Gist options
  • Save puraminy/f1b1a19765b143ac27249ab369e4f1d7 to your computer and use it in GitHub Desktop.
Save puraminy/f1b1a19765b143ac27249ab369e4f1d7 to your computer and use it in GitHub Desktop.
Forward Feature Selection in Machine Learning
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
# Load the train/test feature tables and their label tables from CSV files
# located in the current working directory.
x_train = pd.read_csv('Train_Data.csv')
y_train = pd.read_csv('Train_Labels.csv')
x_test = pd.read_csv('Test_Data.csv')
y_test = pd.read_csv('Test_Labels.csv')
print(x_train.shape)

# Optional row cap for quick experiments: set r to an integer to keep only the
# first r rows of every table.  None keeps all rows; the previous value of -1
# made iloc[:r] silently discard the last sample of each table.
r = None
x_train = x_train.iloc[:r]
y_train = y_train.iloc[:r]
x_test = x_test.iloc[:r]
y_test = y_test.iloc[:r]
def preprocess(data):
    """Return ``data`` with every feature min-max scaled into [0, 1].

    A fresh ``MinMaxScaler`` is fitted on ``data`` itself on every call, so
    the scaling parameters are derived from the data being transformed.
    """
    scaler = preprocessing.MinMaxScaler()
    return scaler.fit_transform(data)
# Fit the scaler on the training data only and reuse the same fitted scaler for
# the test data.  Fitting a second scaler on the test set would put the two
# sets on different feature scales and leak test statistics into preprocessing.
min_max_scaler = preprocessing.MinMaxScaler()
x_train = min_max_scaler.fit_transform(x_train)
x_test = min_max_scaler.transform(x_test)
def forward_selection(data, response, classifier, alpha=0.01, exclude=None):
    """Greedily pick feature columns that improve hold-out accuracy.

    Starting from an empty feature set, each round fits ``classifier`` once
    per remaining candidate column (on the already selected columns plus the
    candidate) and keeps the candidate with the highest accuracy.  The search
    stops as soon as the best candidate improves the accuracy of the previous
    round by no more than ``alpha``.

    NOTE: scoring uses the module-level ``x_test`` and ``y_test`` defined
    elsewhere in this file, so the returned accuracies are measured on the
    same data that drove the selection.

    Args:
        data: Training features; indexed as ``data[:, column_list]``, so it
            must support 2-D NumPy-style indexing.
        response: Training labels passed unchanged to ``classifier.fit``.
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        alpha: Minimum accuracy gain required to accept another feature.
        exclude: Optional iterable of column indices that must never be
            selected.  It is copied, never modified.

    Returns:
        A tuple ``(sel_cols, acc_list)``: the selected column indices in the
        order they were added, and the accuracy reached after each addition.
    """
    # Work on a private copy.  The former mutable default (exclude=[]) was
    # appended to on every round, so a second call started with all columns
    # tried by the first call already excluded.
    excluded = set() if exclude is None else set(exclude)

    n_cols = data.shape[1]
    sel_cols = []
    acc_list = []
    old_acc = 0

    while True:
        max_acc = 0
        selected_col = None
        for col in range(n_cols):
            if col in excluded:
                continue
            candidate_cols = sel_cols + [col]
            classifier.fit(data[:, candidate_cols], response)
            y_pred = classifier.predict(x_test[:, candidate_cols])
            # Builtin round() works whether accuracy_score returns a NumPy
            # scalar or a plain Python float (which has no .round method).
            acc = round(accuracy_score(y_test, y_pred), 4)
            if acc > max_acc:
                max_acc = acc
                selected_col = col

        # Stop when no candidate is left (or none scored above zero) or when
        # the best candidate does not beat the previous round by more than
        # alpha.  The explicit None check prevents appending None as a column.
        if selected_col is None or max_acc - old_acc <= alpha:
            print("\n==> No feature to add (based on the difference treshold of alpha:", alpha, ")\n")
            break

        excluded.add(selected_col)
        sel_cols.append(selected_col)
        print("{: <80} acc: {}".format('{}'.format(sel_cols), max_acc))
        acc_list.append(max_acc)
        old_acc = max_acc

    return sel_cols, acc_list
# import warnings
# warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier
# Run forward selection with a Gaussian naive Bayes classifier and plot how
# the accuracy evolves as features are added.
naive_bayes = GaussianNB()
sel_cols, acc_list = forward_selection(x_train, y_train, naive_bayes, alpha=0.001)

# acc_list is empty when not even the first feature clears the alpha
# threshold; indexing [-1] would raise IndexError in that case.
final_acc = acc_list[-1] if acc_list else None
print("selected features:", sel_cols, "final acc:", final_acc)

fig = plt.figure(figsize=(10,8))
plt.plot(acc_list)
plt.savefig('plot.png')
# Release the figure once it has been written to disk.
plt.close(fig)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment