Skip to content

Instantly share code, notes, and snippets.

@iaroslav-ai
Created October 31, 2018 14:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iaroslav-ai/3bfceeb277960f9585f284212f838963 to your computer and use it in GitHub Desktop.
Save iaroslav-ai/3bfceeb277960f9585f284212f838963 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analysis of SMOTE data preprocessing approach\n",
"\n",
"It is important to ensure that a data augmentation approach does not produce outliers. This can happen with SMOTE when convex combinations of feature vectors end up inside another class's cluster. A simple way to test for such a situation is to compare validation accuracy with and without data augmentation."
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Positive class instances: 11\n",
"Negative class instances: 86\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"np.random.seed(1)\n",
"\n",
"# Example imbalanced data to be fixed by SMOTE\n",
"# Data with positive class points surrounding two \n",
"# clusters of negative data points.\n",
"# Label each point by whether it lies outside the unit circle (before shifting).\n",
"X = np.random.randn(200, 2)\n",
"y = (X ** 2).sum(axis=-1) > 1\n",
"# Draw a random +1/-1 per row and add it to both coordinates of that row.\n",
"I = np.where(np.random.rand(len(X)) > 0.5, 1, -1)\n",
"X = X + I[:, np.newaxis]\n",
"\n",
"# Make data imbalanced\n",
"# Keep every row labelled False, plus every 10th row regardless of its label.\n",
"I = ~y\n",
"I[::10] = True\n",
"X, y = X[I], y[I]\n",
"\n",
"print('Positive class instances: %s' % np.count_nonzero(y))\n",
"print('Negative class instances: %s' % np.count_nonzero(~y))\n",
"\n",
"# Oversample with SMOTE using its default settings\n",
"smote = SMOTE()\n",
"Xr, yr = smote.fit_resample(X, y)\n",
"\n",
"# visualize results\n",
"def plot_data(X, y, title):\n",
"    \"\"\"Scatter-plot a two-class, two-column data set via pyplot.\n",
"\n",
"    X is indexed as X[mask, 0] / X[mask, 1] and y is used as a boolean\n",
"    mask (and inverted with ~), so y is expected to be a boolean array.\n",
"    Rows where y is False are drawn first, then rows where y is True.\n",
"    \"\"\"\n",
"    plt.title(title)\n",
"    for mask in (~y, y):\n",
"        plt.scatter(X[mask, 0], X[mask, 1])\n",
" \n",
"# Draw the original and the resampled data in a 1x2 grid of subplots.\n",
"panels = [(X, y, 'Original'), (Xr, yr, 'SMOTE')]\n",
"for position, (points, labels, heading) in enumerate(panels, start=1):\n",
"    plt.subplot(1, 2, position)\n",
"    plot_data(points, labels, heading)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment