Skip to content

Instantly share code, notes, and snippets.

@veb-101
Created April 17, 2020 21:52
Show Gist options
  • Save veb-101/d747d5841eba9ae9fe51ff9bacddfa42 to your computer and use it in GitHub Desktop.
Save veb-101/d747d5841eba9ae9fe51ff9bacddfa42 to your computer and use it in GitHub Desktop.
Feature Selection.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
},
"colab": {
"name": "Feature Selection.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/veb-101/d747d5841eba9ae9fe51ff9bacddfa42/feature-selection.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "hx-k-wxEmaNx",
"colab_type": "code",
"colab": {}
},
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from numpy import set_printoptions\n",
"from sklearn.feature_selection import SelectKBest\n",
"from sklearn.feature_selection import f_classif, chi2\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score as acc\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-17T17:58:44.074634Z",
"start_time": "2020-04-17T17:58:44.066536Z"
},
"id": "8XG3q6sGlhJW",
"colab_type": "text"
},
"source": [
"# Filter based methods"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_-4K9Al4-MEa",
"colab_type": "text"
},
"source": [
"## Feature Selection with Univariate Statistical Tests"
]
},
{
"cell_type": "code",
"metadata": {
"id": "p0hLX9hDmaQb",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 196
},
"outputId": "1a77cb07-f90f-4827-89b7-3f42803b4c8f"
},
"source": [
"# Feature Selection with Univariate Statistical Tests\n",
"\n",
"# short column names applied to the CSV (8 features followed by the class label)\n",
"names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n",
"filename = '/content/diabetes_data.csv'\n",
"\n",
"# the first row of the file is skipped and replaced by `names`\n",
"dataframe = pd.read_csv(filename, names=names, skiprows=1)\n",
"dataframe.head()"
],
"execution_count": 2,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>preg</th>\n",
" <th>plas</th>\n",
" <th>pres</th>\n",
" <th>skin</th>\n",
" <th>test</th>\n",
" <th>mass</th>\n",
" <th>pedi</th>\n",
" <th>age</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6</td>\n",
" <td>148</td>\n",
" <td>72</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>33.6</td>\n",
" <td>0.627</td>\n",
" <td>50</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>85</td>\n",
" <td>66</td>\n",
" <td>29</td>\n",
" <td>0</td>\n",
" <td>26.6</td>\n",
" <td>0.351</td>\n",
" <td>31</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8</td>\n",
" <td>183</td>\n",
" <td>64</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23.3</td>\n",
" <td>0.672</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>89</td>\n",
" <td>66</td>\n",
" <td>23</td>\n",
" <td>94</td>\n",
" <td>28.1</td>\n",
" <td>0.167</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>137</td>\n",
" <td>40</td>\n",
" <td>35</td>\n",
" <td>168</td>\n",
" <td>43.1</td>\n",
" <td>2.288</td>\n",
" <td>33</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" preg plas pres skin test mass pedi age class\n",
"0 6 148 72 35 0 33.6 0.627 50 1\n",
"1 1 85 66 29 0 26.6 0.351 31 0\n",
"2 8 183 64 0 0 23.3 0.672 32 1\n",
"3 1 89 66 23 94 28.1 0.167 21 0\n",
"4 0 137 40 35 168 43.1 2.288 33 1"
]
},
"metadata": {
"tags": []
},
"execution_count": 2
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "WpYnRpv8maTL",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 33
},
"outputId": "1772be25-10a6-41f9-dc69-4119aabfa60f"
},
"source": [
"# split the raw values into the 8 feature columns and the class label\n",
"array = dataframe.values\n",
"X = array[:, 0:8]\n",
"Y = array[:, 8]\n",
"\n",
"# feature selection using SelectKBest\n",
"\n",
"# keep the 6 highest-scoring features according to the chi2 test\n",
"# (chi2 only accepts non-negative feature values)\n",
"test = SelectKBest(score_func=chi2, k=6)\n",
"feature_set = test.fit(X, Y)\n",
"\n",
"# summarize scores: one chi2 statistic per feature, in column order\n",
"np.set_printoptions(precision=3)\n",
"print(feature_set.scores_)\n",
"\n",
"# new X restricted to the 6 selected features\n",
"# NOTE: the selector is fitted on all rows here, so evaluating a model on a\n",
"# train/test split of X_new leaks test information into the feature choice.\n",
"X_new = feature_set.transform(X)"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"[ 111.52 1411.887 17.605 53.108 2175.565 127.669 5.393 181.304]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "plbDaZHWmaWt",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 50
},
"outputId": "b10e941c-09ac-4eb8-f161-de03bad88cae"
},
"source": [
"# one stratified split, shared by both models so the two accuracies are comparable\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)\n",
"\n",
"# baseline: all features\n",
"lr = LogisticRegression()\n",
"lr.fit(X_train, y_train)\n",
"y_pred = lr.predict(X_test)\n",
"print(f\"Whole Acc: {acc(y_test, y_pred)}\")\n",
"\n",
"# reduced feature set: fit the selector on the training rows only, so the\n",
"# held-out rows play no part in choosing the features (avoids data leakage)\n",
"selector = SelectKBest(score_func=chi2, k=6).fit(X_train, y_train)\n",
"X_train_trans = selector.transform(X_train)\n",
"X_test_trans = selector.transform(X_test)\n",
"\n",
"lr = LogisticRegression()\n",
"lr.fit(X_train_trans, y_train)\n",
"y_pred = lr.predict(X_test_trans)\n",
"print(f\"Transformed Acc: {acc(y_test, y_pred)}\")"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"Whole Acc: 0.7142857142857143\n",
"Transformed Acc: 0.7077922077922078\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5eStirSe_SK6",
"colab_type": "text"
},
"source": [
"# Wrapper Method"
]
},
{
"cell_type": "code",
"metadata": {
"id": "OCkkEsyJ_U6i",
"colab_type": "code",
"colab": {}
},
"source": [
"# reload the data as a DataFrame so the column names stay attached\n",
"filename = '/content/diabetes_data.csv'\n",
"names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n",
"\n",
"dataframe = pd.read_csv(filename, skiprows=1, names=names)\n",
"\n",
"# the last column is the target, everything before it is a feature\n",
"feature_cols = dataframe.columns[:-1]\n",
"target_col = dataframe.columns[-1]\n",
"X = dataframe[feature_cols]\n",
"Y = dataframe[target_col]\n",
"\n",
"# stratified 80/20 split with a fixed seed\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, Y, test_size=0.2, stratify=Y, random_state=42\n",
")"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "0TrwwU2n_U8s",
"colab_type": "code",
"colab": {}
},
"source": [
"# Forward Feature Selection"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "cY-d_KsT_VBD",
"colab_type": "code",
"colab": {}
},
"source": [
"!pip install mlxtend -q"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fXQqSTAv_VEY",
"colab_type": "code",
"colab": {}
},
"source": [
"from mlxtend.feature_selection import SequentialFeatureSelector\n",
"from sklearn.ensemble import RandomForestClassifier"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "2fKAkCiA_VPn",
"colab_type": "code",
"colab": {}
},
"source": [
"# forward selection of 5 features, scored by 3-fold cross-validated accuracy;\n",
"# the forest is seeded so the selected subset is reproducible across runs\n",
"sfs = SequentialFeatureSelector(RandomForestClassifier(random_state=42),\n",
"                                k_features=5,\n",
"                                forward=True,\n",
"                                floating=False,\n",
"                                scoring='accuracy',\n",
"                                cv=3)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "RDa-yp0T_VTl",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 50
},
"outputId": "3608d7e7-26e9-4a23-c5c0-db1337c7e85f"
},
"source": [
"# run the forward search on the training split\n",
"sfs = sfs.fit(X_train, y_train)\n",
"\n",
"# names of the columns the search kept\n",
"chosen_idx = list(sfs.k_feature_idx_)\n",
"selected_features = X_train.columns[chosen_idx]\n",
"print(selected_features)\n",
"\n",
"# cross-validated score of the chosen subset\n",
"print(sfs.k_score_)\n",
"\n",
"# project both splits onto the chosen columns\n",
"x_train_sfs, x_test_sfs = sfs.transform(X_train), sfs.transform(X_test)"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": [
"Index(['preg', 'plas', 'mass', 'pedi', 'age'], dtype='object')\n",
"0.7752829587119402\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Di7gCn5o_VW-",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 33
},
"outputId": "5f335902-aa60-4597-fbd9-7f43ba7e6f61"
},
"source": [
"# evaluate a forest on the SFS-selected columns; seeded so the accuracy is reproducible\n",
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"rnd_clf.fit(x_train_sfs, y_train)\n",
"y_pred = rnd_clf.predict(x_test_sfs)\n",
"print(f\"Transformed Acc: {acc(y_test, y_pred)}\")"
],
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"text": [
"Transformed Acc: 0.7337662337662337\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0sACnPxSBSLa",
"colab_type": "text"
},
"source": [
"# Embedded"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Dqqu1RmoBhsl",
"colab_type": "text"
},
"source": [
"## L1-based feature selection"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4IIahgWLBfEt",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 50
},
"outputId": "b8372a75-5af0-49fb-fad9-03e9e0791a66"
},
"source": [
"from sklearn.svm import LinearSVC\n",
"from sklearn.datasets import load_iris\n",
"from sklearn.feature_selection import SelectFromModel\n",
"\n",
"X, y = load_iris(return_X_y=True)\n",
"print(X.shape)\n",
"\n",
"# An L1 penalty drives some coefficients to exactly zero, and SelectFromModel\n",
"# drops the features whose coefficients vanish. LinearSVC defaults to an L2\n",
"# penalty and requires dual=False for penalty='l1'; a smaller C keeps fewer features.\n",
"lsvc = LinearSVC(C=0.01, penalty='l1', dual=False).fit(X, y)\n",
"model = SelectFromModel(lsvc, prefit=True)\n",
"X_new = model.transform(X)\n",
"print(X_new.shape)"
],
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": [
"(150, 4)\n",
"(150, 3)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "RcKy0I7DBfIA",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 33
},
"outputId": "b560cce2-17d5-4f9f-918c-7327f776abf1"
},
"source": [
"# stratified 85/15 split of the reduced iris features\n",
"X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.15, stratify=y, random_state=42)\n",
"\n",
"# LinearSVC's default dual solver is randomised; seed it so the accuracy is reproducible\n",
"lin_svm = LinearSVC(random_state=42)\n",
"lin_svm.fit(X_train, y_train)\n",
"print(f\"Transformed Accuracy: {acc(y_test, lin_svm.predict(X_test))}\")"
],
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"text": [
"Transformed Accuracy: 0.9130434782608695\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "TMp99c7LDpYD",
"colab_type": "text"
},
"source": [
"# Hybrid"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_koJ8y3ODq4L",
"colab_type": "text"
},
"source": [
"## Recursive feature elimination"
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-17T18:33:03.523356Z",
"start_time": "2020-04-17T18:33:03.515292Z"
},
"id": "c0gslvTYlhJh",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 382
},
"outputId": "41ae04f8-807d-45e1-8b13-327b18c948f0"
},
"source": [
"# Feature selection with RFE (recursive feature elimination, cross-validated)\n",
"from sklearn.svm import SVC\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.feature_selection import RFECV\n",
"\n",
"\n",
"# load data\n",
"filename = '/content/diabetes_data.csv'\n",
"names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n",
"\n",
"dataframe = pd.read_csv(filename, skiprows=1, names=names)\n",
"\n",
"X = dataframe[dataframe.columns[:-1]]\n",
"Y = dataframe[dataframe.columns[-1]]\n",
"\n",
"# recursive elimination ranked by the coefficients of a linear SVM\n",
"svc = SVC(kernel=\"linear\")\n",
"\n",
"rfe_step = 2  # number of features dropped per elimination round\n",
"rfecv = RFECV(estimator=svc, step=rfe_step, cv=StratifiedKFold(3),\n",
"              scoring='accuracy')\n",
"fit = rfecv.fit(X, Y)\n",
"\n",
"print(\"Num Features: %d\" % fit.n_features_)\n",
"# list only the candidate features so the names line up with the masks below\n",
"print(X.columns)\n",
"print(\"Selected Features: %s\" % fit.support_)\n",
"print(\"Feature Ranking: %s\" % fit.ranking_)\n",
"print(\"Optimal number of features : %d\" % rfecv.n_features_)\n",
"\n",
"# mean CV score per evaluated subset, smallest subset first;\n",
"# newer scikit-learn exposes it as cv_results_, older releases as grid_scores_\n",
"if hasattr(fit, \"cv_results_\"):\n",
"    cv_scores = fit.cv_results_[\"mean_test_score\"]\n",
"else:\n",
"    cv_scores = fit.grid_scores_\n",
"\n",
"# subset sizes actually evaluated: all features, minus rfe_step per round, down to 1\n",
"subset_sizes = sorted(set(range(X.shape[1], 1, -rfe_step)) | {1})\n",
"\n",
"# Plot number of features VS. cross-validation scores\n",
"plt.figure()\n",
"plt.xlabel(\"Number of features selected\")\n",
"plt.ylabel(\"Cross validation score (accuracy)\")\n",
"plt.plot(subset_sizes, cv_scores)\n",
"plt.show()"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": [
"Num Features: 6\n",
"Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')\n",
"Selected Features: [ True True True False False True True True]\n",
"Feature Ranking: [1 1 1 2 2 1 1 1]\n",
"Optimal number of features : 6\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2Nckjew3GL2v",
"colab_type": "text"
},
"source": [
"# Tree Based"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AcqQy_SoGPf7",
"colab_type": "text"
},
"source": [
"## Feature importance"
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-17T18:33:36.916676Z",
"start_time": "2020-04-17T18:33:36.912199Z"
},
"id": "rC2fDzjZlhJj",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 150
},
"outputId": "5cc844fc-1d07-4b06-de64-3df3f42f73b5"
},
"source": [
"from sklearn.ensemble import ExtraTreesClassifier\n",
"\n",
"# load data\n",
"filename = '/content/diabetes_data.csv'\n",
"names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n",
"\n",
"dataframe = pd.read_csv(filename, skiprows=1, names=names)\n",
"\n",
"X = dataframe[dataframe.columns[:-1]]\n",
"Y = dataframe[dataframe.columns[-1]]\n",
"\n",
"# seeded so the importances (and their ordering) are reproducible across runs\n",
"model = ExtraTreesClassifier(n_estimators=10, random_state=42)\n",
"model.fit(X, Y)\n",
"\n",
"# pair each importance with its feature column (X.columns excludes the target),\n",
"# then list them from most to least important\n",
"for score, name in sorted(zip(model.feature_importances_, X.columns), reverse=True):\n",
"    print(f\"{name}: {score}\")"
],
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"text": [
"plas: 0.2099093726365222\n",
"age: 0.14819210935381139\n",
"mass: 0.13157244530019407\n",
"preg: 0.12348960652746636\n",
"pedi: 0.11552694358738022\n",
"pres: 0.10794313979850405\n",
"skin: 0.0829651270073681\n",
"test: 0.08040125578875357\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment