Skip to content

Instantly share code, notes, and snippets.

@egemenzeytinci
Last active April 30, 2020 10:59
Show Gist options
  • Save egemenzeytinci/66141fa85b190730541be4a29d422818 to your computer and use it in GitHub Desktop.
Save egemenzeytinci/66141fa85b190730541be4a29d422818 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-30T10:58:37.368288Z",
"start_time": "2020-04-30T10:58:35.959776Z"
}
},
"outputs": [],
"source": [
"%%capture\n",
"!pip3 install ycimpute"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-30T10:58:40.905295Z",
"start_time": "2020-04-30T10:58:37.372360Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------Before--------------------\n",
"Unique values of sex: [nan 0. 1.]\n",
"Unique count of age: 39\n",
"Sex 73\n",
"Age 100\n",
"dtype: int64\n",
"--------------------After---------------------\n",
"Unique values of sex: [1. 0.]\n",
"Unique count of age: 107\n",
"Sex 0\n",
"Age 0\n",
"dtype: int64\n"
]
}
],
"source": [
"from ycimpute.imputer import iterforest\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"url = 'https://git.io/JvW9W'\n",
"cols = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']\n",
"data = pd.read_csv(url, sep='\\t').drop(columns=cols).copy()\n",
"\n",
"fake = data.copy()\n",
"np.random.seed(100)\n",
"\n",
"cols = ['Sex', 'Age']\n",
"mask = np.random.choice([True, False], size=fake[cols].shape)\n",
"fake[cols] = fake[cols].mask(mask)\n",
"\n",
"fake['Sex'] = fake['Sex'].replace({'female': 0, 'male': 1})\n",
"\n",
"print('--' * 10 + 'Before' + '--' * 10)\n",
"print(f\"Unique values of sex: {fake['Sex'].unique()}\")\n",
"print(f\"Unique count of age: {fake['Age'].nunique()}\")\n",
"print(fake[cols].isnull().sum())\n",
"\n",
"X = np.array(fake)\n",
"dff = iterforest.MissForest().complete(X)\n",
"filled = pd.DataFrame(dff, columns=fake.columns)\n",
"\n",
"print('--' * 10 + 'After-' + '--' * 10)\n",
"print(f\"Unique values of sex: {filled['Sex'].unique()}\")\n",
"print(f\"Unique count of age: {filled['Age'].nunique()}\")\n",
"print(filled[cols].isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-30T10:58:40.920068Z",
"start_time": "2020-04-30T10:58:40.909327Z"
}
},
"outputs": [],
"source": [
"filled.Sex = filled.Sex.replace({0: 'female', 1: 'male'})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-30T10:58:40.951439Z",
"start_time": "2020-04-30T10:58:40.922752Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>male</td>\n",
" <td>31.713524</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>7.2500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>female</td>\n",
" <td>38.000000</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>71.2833</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>female</td>\n",
" <td>26.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>7.9250</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>female</td>\n",
" <td>25.705533</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>53.1000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>female</td>\n",
" <td>19.348333</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>8.0500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Sex Age SibSp Parch Fare\n",
"0 0.0 3.0 male 31.713524 1.0 0.0 7.2500\n",
"1 1.0 1.0 female 38.000000 1.0 0.0 71.2833\n",
"2 1.0 3.0 female 26.000000 0.0 0.0 7.9250\n",
"3 1.0 1.0 female 25.705533 1.0 0.0 53.1000\n",
"4 0.0 3.0 female 19.348333 0.0 0.0 8.0500"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filled.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment