Last active
May 23, 2021 21:21
-
-
Save shantoroy/9bb4da0b2a281e3c91cc836045b6c74d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Modified from source: https://machinelearningmastery.com/feature-selection-machine-learning-python/ | |
# Feature Selection with Univariate Statistical Tests | |
from pandas import read_csv | |
from numpy import set_printoptions | |
from sklearn.feature_selection import SelectKBest | |
from sklearn.feature_selection import f_classif | |
from sklearn.feature_selection import chi2 | |
# select best features from all features using univariate statistical tests
def univariate_stat(df, names, no_of_best):
    """Score every feature against the class label with f_classif and chi2.

    Args:
        df: pandas DataFrame containing at least the columns in `names`.
        names: column names; the LAST entry is treated as the class label.
        no_of_best: k handed to SelectKBest (number of top features to keep).

    Prints, for each statistical test, a Feature/Score table sorted by
    score in descending order.
    """
    print("##############################")
    print("####### f_classif, chi2 ######")
    print("##############################")
    df = df[names]
    # considering the last column as class labels
    array = df.values
    feature_names = names[:-1]  # explicit: scores align with feature columns only
    X = array[:, 0:len(names) - 1]
    Y = array[:, len(names) - 1]
    # NOTE: chi2 requires non-negative feature values
    stat_list = [f_classif, chi2]
    for stat_test in stat_list:
        # feature extraction
        test = SelectKBest(score_func=stat_test, k=no_of_best)
        fit = test.fit(X, Y)
        # summarize scores, best first; label which test produced them so the
        # two tables in the output are distinguishable
        score = dict(zip(feature_names, fit.scores_))
        feature_scores = dict(sorted(score.items(), key=lambda item: item[1], reverse=True))
        print("")
        print("--- {} ---".format(stat_test.__name__))
        print("{:<15} {:<10}".format('Feature', 'Score'))
        for k, v in feature_scores.items():
            print("{:<15} {:<10}".format(k, v))
# Feature Extraction with RFE | |
from sklearn.feature_selection import RFE | |
from sklearn.linear_model import LogisticRegression | |
def recursive_feature_eliminate(df, names, no_of_best):
    """Rank features with Recursive Feature Elimination (RFE).

    Args:
        df: pandas DataFrame containing at least the columns in `names`.
        names: column names; the LAST entry is treated as the class label.
        no_of_best: number of features RFE should select.

    Prints a Feature/Support table (True for selected features) followed by
    a Feature/Rank table sorted by rank (1 = selected).
    """
    print("##############################")
    print("############# RFE ############")
    print("##############################")
    df = df[names]
    # considering the last column as class labels
    array = df.values
    feature_names = names[:-1]  # explicit: rankings align with feature columns only
    X = array[:, 0:len(names) - 1]
    Y = array[:, len(names) - 1]
    # feature extraction; max_iter raised because lbfgs frequently fails to
    # converge within the default 100 iterations on unscaled data
    model = LogisticRegression(solver='lbfgs', max_iter=1000)
    # BUG FIX: n_features_to_select is keyword-only in modern scikit-learn
    # (positional form RFE(model, no_of_best) raises TypeError since 1.2)
    rfe = RFE(model, n_features_to_select=no_of_best)
    fit = rfe.fit(X, Y)
    support = dict(zip(feature_names, fit.support_))
    print("{:<15} {:<10}".format('Feature', 'Support'))
    for k, v in support.items():
        print("{:<15} {:<10}".format(k, v))
    selection = dict(zip(feature_names, fit.ranking_))
    feature_rank = dict(sorted(selection.items(), key=lambda item: item[1]))
    print("")
    print("{:<15} {:<10}".format('Feature', 'Rank'))
    for k, v in feature_rank.items():
        print("{:<15} {:<10}".format(k, v))
# Feature Importance with Extra Trees Classifier | |
from sklearn.ensemble import ExtraTreesClassifier | |
def extra_tree_classifier(df, names):
    """Print each feature's importance as estimated by an ExtraTreesClassifier.

    The last entry of `names` is taken to be the class-label column; the
    remaining columns are the features. Importances are listed in
    descending order.
    """
    banner = "##############################"
    print(banner)
    print("#### ExtraTreesClassifier ####")
    print(banner)
    # last column holds the class labels; everything before it is a feature
    data = df[names].values
    n_features = len(names) - 1
    X, Y = data[:, :n_features], data[:, n_features]
    # feature extraction: fit the ensemble and read off its importances
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X, Y)
    # pair feature names with their importances, highest first
    # (zip stops at the shorter sequence, so the label column is skipped)
    ranked = sorted(zip(names, clf.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
    print("{:<15} {:<10}".format('Feature', 'Importance'))
    for feature, weight in ranked:
        print("{:<15} {:<10}".format(feature, weight))
if __name__ == "__main__":
    # Pima Indians diabetes dataset: 8 numeric features + binary 'class' label
    url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"
    names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
    df = read_csv(url, names=names)
    no_of_best = int(input("Enter the no. of best features: "))
    # validate before handing k to sklearn, which would otherwise fail with a
    # confusing deep-library error for out-of-range values
    if not 1 <= no_of_best <= len(names) - 1:
        raise SystemExit(
            "no. of best features must be between 1 and {}".format(len(names) - 1))
    print("")
    univariate_stat(df, names, no_of_best)
    print("")
    recursive_feature_eliminate(df, names, no_of_best)
    print("")
    extra_tree_classifier(df, names)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment