egemenzeytinci/ycimpute_test.ipynb

## ycimpute_test.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-30T10:58:37.368288Z",
     "start_time": "2020-04-30T10:58:35.959776Z"
    }
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip3 install ycimpute"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-30T10:58:40.905295Z",
     "start_time": "2020-04-30T10:58:37.372360Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--------------------Before--------------------\n",
      "Unique values of sex: [nan  0.  1.]\n",
      "Unique count of age: 39\n",
      "Sex     73\n",
      "Age    100\n",
      "dtype: int64\n",
      "--------------------After---------------------\n",
      "Unique values of sex: [1. 0.]\n",
      "Unique count of age: 107\n",
      "Sex    0\n",
      "Age    0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from ycimpute.imputer import iterforest\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "url = 'https://git.io/JvW9W'\n",
    "cols = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']\n",
    "data = pd.read_csv(url, sep='\\t').drop(columns=cols).copy()\n",
    "\n",
    "fake = data.copy()\n",
    "np.random.seed(100)\n",
    "\n",
    "cols = ['Sex', 'Age']\n",
    "mask = np.random.choice([True, False], size=fake[cols].shape)\n",
    "fake[cols] = fake[cols].mask(mask)\n",
    "\n",
    "fake['Sex'] = fake['Sex'].replace({'female': 0, 'male': 1})\n",
    "\n",
    "print('--' * 10 + 'Before' + '--' * 10)\n",
    "print(f\"Unique values of sex: {fake['Sex'].unique()}\")\n",
    "print(f\"Unique count of age: {fake['Age'].nunique()}\")\n",
    "print(fake[cols].isnull().sum())\n",
    "\n",
    "X = np.array(fake)\n",
    "dff = iterforest.MissForest().complete(X)\n",
    "filled = pd.DataFrame(dff, columns=fake.columns)\n",
    "\n",
    "print('--' * 10 + 'After-' + '--' * 10)\n",
    "print(f\"Unique values of sex: {filled['Sex'].unique()}\")\n",
    "print(f\"Unique count of age: {filled['Age'].nunique()}\")\n",
    "print(filled[cols].isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-30T10:58:40.920068Z",
     "start_time": "2020-04-30T10:58:40.909327Z"
    }
   },
   "outputs": [],
   "source": [
    "filled.Sex = filled.Sex.replace({0: 'female', 1: 'male'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-30T10:58:40.951439Z",
     "start_time": "2020-04-30T10:58:40.922752Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>male</td>\n",
       "      <td>31.713524</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.2500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>female</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>71.2833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>female</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.9250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>female</td>\n",
       "      <td>25.705533</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>53.1000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>female</td>\n",
       "      <td>19.348333</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass     Sex        Age  SibSp  Parch     Fare\n",
       "0       0.0     3.0    male  31.713524    1.0    0.0   7.2500\n",
       "1       1.0     1.0  female  38.000000    1.0    0.0  71.2833\n",
       "2       1.0     3.0  female  26.000000    0.0    0.0   7.9250\n",
       "3       1.0     1.0  female  25.705533    1.0    0.0  53.1000\n",
       "4       0.0     3.0  female  19.348333    0.0    0.0   8.0500"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filled.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2020-04-30T10:58:37.368288Z",
	"start_time": "2020-04-30T10:58:35.959776Z"
	}
	},
	"outputs": [],
	"source": [
	"%%capture\n",
	"!pip3 install ycimpute"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2020-04-30T10:58:40.905295Z",
	"start_time": "2020-04-30T10:58:37.372360Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"--------------------Before--------------------\n",
	"Unique values of sex: [nan 0. 1.]\n",
	"Unique count of age: 39\n",
	"Sex 73\n",
	"Age 100\n",
	"dtype: int64\n",
	"--------------------After---------------------\n",
	"Unique values of sex: [1. 0.]\n",
	"Unique count of age: 107\n",
	"Sex 0\n",
	"Age 0\n",
	"dtype: int64\n"
	]
	}
	],
	"source": [
	"from ycimpute.imputer import iterforest\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"\n",
	"url = 'https://git.io/JvW9W'\n",
	"cols = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']\n",
	"data = pd.read_csv(url, sep='\\t').drop(columns=cols).copy()\n",
	"\n",
	"fake = data.copy()\n",
	"np.random.seed(100)\n",
	"\n",
	"cols = ['Sex', 'Age']\n",
	"mask = np.random.choice([True, False], size=fake[cols].shape)\n",
	"fake[cols] = fake[cols].mask(mask)\n",
	"\n",
	"fake['Sex'] = fake['Sex'].replace({'female': 0, 'male': 1})\n",
	"\n",
	"print('--' * 10 + 'Before' + '--' * 10)\n",
	"print(f\"Unique values of sex: {fake['Sex'].unique()}\")\n",
	"print(f\"Unique count of age: {fake['Age'].nunique()}\")\n",
	"print(fake[cols].isnull().sum())\n",
	"\n",
	"X = np.array(fake)\n",
	"dff = iterforest.MissForest().complete(X)\n",
	"filled = pd.DataFrame(dff, columns=fake.columns)\n",
	"\n",
	"print('--' * 10 + 'After-' + '--' * 10)\n",
	"print(f\"Unique values of sex: {filled['Sex'].unique()}\")\n",
	"print(f\"Unique count of age: {filled['Age'].nunique()}\")\n",
	"print(filled[cols].isnull().sum())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2020-04-30T10:58:40.920068Z",
	"start_time": "2020-04-30T10:58:40.909327Z"
	}
	},
	"outputs": [],
	"source": [
	"filled.Sex = filled.Sex.replace({0: 'female', 1: 'male'})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2020-04-30T10:58:40.951439Z",
	"start_time": "2020-04-30T10:58:40.922752Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Survived</th>\n",
	" <th>Pclass</th>\n",
	" <th>Sex</th>\n",
	" <th>Age</th>\n",
	" <th>SibSp</th>\n",
	" <th>Parch</th>\n",
	" <th>Fare</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0.0</td>\n",
	" <td>3.0</td>\n",
	" <td>male</td>\n",
	" <td>31.713524</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>7.2500</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1.0</td>\n",
	" <td>1.0</td>\n",
	" <td>female</td>\n",
	" <td>38.000000</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>71.2833</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1.0</td>\n",
	" <td>3.0</td>\n",
	" <td>female</td>\n",
	" <td>26.000000</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>7.9250</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>1.0</td>\n",
	" <td>1.0</td>\n",
	" <td>female</td>\n",
	" <td>25.705533</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>53.1000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>0.0</td>\n",
	" <td>3.0</td>\n",
	" <td>female</td>\n",
	" <td>19.348333</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>8.0500</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Survived Pclass Sex Age SibSp Parch Fare\n",
	"0 0.0 3.0 male 31.713524 1.0 0.0 7.2500\n",
	"1 1.0 1.0 female 38.000000 1.0 0.0 71.2833\n",
	"2 1.0 3.0 female 26.000000 0.0 0.0 7.9250\n",
	"3 1.0 1.0 female 25.705533 1.0 0.0 53.1000\n",
	"4 0.0 3.0 female 19.348333 0.0 0.0 8.0500"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"filled.head()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	},
	"toc": {
	"base_numbering": 1,
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}