jbanerje/Filling Missing or Incorrect Data.ipynb

## Filling Missing or Incorrect Data.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Import Library for Data manipulation\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import Imputer\n",
    "\n",
    "#Ignoring the warnings\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the dataset\n",
    "dataset = pd.read_csv(\"https://github.com/jbanerje/Data-Science-and-Machine-Learning/tree/master/data/Height_Weight_Data.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1 : Get the shape of the dataset (rows, columns) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(200, 3)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Observation - we have 200 rows and 3 columns."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2 : Get the column names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Index', 'Height(Inches)', 'Weight(Pounds)']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(dataset.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### We have 3 columns 'Index', 'Height(Inches)', 'Weight(Pounds)' . Always a good practice to remove brackets and replace with underscores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Index', 'Height_in_Inches', 'Weight_in_Pounds']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Renaming the columns using pandas rename function.\n",
    "dataset = dataset.rename(columns={'Height(Inches)':'Height_in_Inches', 'Weight(Pounds)' : 'Weight_in_Pounds'})\n",
    "\n",
    "#getting the column names again\n",
    "list(dataset.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3 : Get the column information or Datatype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 200 entries, 0 to 199\n",
      "Data columns (total 3 columns):\n",
      "Index               200 non-null int64\n",
      "Height_in_Inches    197 non-null float64\n",
      "Weight_in_Pounds    198 non-null object\n",
      "dtypes: float64(1), int64(1), object(1)\n",
      "memory usage: 4.8+ KB\n"
     ]
    }
   ],
   "source": [
    "dataset.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Lookout!!!! \n",
    "Weight_in_Pounds is non-numeric. This raises  red flag. We are expecting weight to be a numeric field but it turns out to be object or non-numeric. We will assess this field more"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 4 : Getting basic statistics for the columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Index</th>\n",
       "      <th>Height_in_Inches</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>197.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>100.500000</td>\n",
       "      <td>67.590508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>57.879185</td>\n",
       "      <td>5.211051</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>50.750000</td>\n",
       "      <td>66.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>100.500000</td>\n",
       "      <td>67.930000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>150.250000</td>\n",
       "      <td>69.200000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>73.900000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Index  Height_in_Inches\n",
       "count  200.000000        197.000000\n",
       "mean   100.500000         67.590508\n",
       "std     57.879185          5.211051\n",
       "min      1.000000          0.000000\n",
       "25%     50.750000         66.500000\n",
       "50%    100.500000         67.930000\n",
       "75%    150.250000         69.200000\n",
       "max    200.000000         73.900000"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.describe()  # Basic Statistics on Numeric Columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Observation for Height_in_Inches Column:\n",
    "1. Look at the row - \"count\" . Index has 200 rows while Height_in_Inches contains 197 rows. There must be 3 rows missing data\n",
    "2. Look at the row \"min\" - Height_in_Inches contains 0 value. Minimum height is 0, which is impossible. This must an error."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Weight_in_Pounds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>198</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>141.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Weight_in_Pounds\n",
       "count               198\n",
       "unique              194\n",
       "top              141.49\n",
       "freq                  2"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.describe(include=['O'])  # Frequency table for non numeric columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Observation for Weight_in_Pounds Column:\n",
    "1. Look at the row - \"count\" . It contains 198 rows. 2 rows missing data\n",
    "2. Look at the row \"unique\" - It has 194 unique counts, meaning rest 6 rows must be having something different. We will find it out."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 5 : Handling Height_in_Inches Column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Missing Value Rows--> [12, 110, 187] \n",
      "Missing data in rows--> 3\n"
     ]
    }
   ],
   "source": [
    "#Identifying the rows containing missing data\n",
    "missing_value_row = list(dataset[dataset['Height_in_Inches'].isnull()].index)\n",
    "print('Missing Value Rows-->', missing_value_row , '\\nMissing data in rows-->', len(missing_value_row))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Observation:\n",
    "Row 12, 110, 187 contains missing data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 5a : Filling the missing values\n",
    "We can fill the missing values with different options depending on the situation. We can fill with\n",
    "1. Mean\n",
    "2. Median\n",
    "3. Calculated Value\n",
    "4. Hard-Coded Value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "67.93"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Median of all Heights can be found using the below function\n",
    "height_median = dataset['Height_in_Inches'].median()\n",
    "height_median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "67.59050761421318"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean of all Heights can be found using the below function\n",
    "Height_mean = dataset['Height_in_Inches'].mean()\n",
    "Height_mean"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Step 5a.1 Filling missing value with Median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We will copy the data into a new datset and replace the values with median .\n",
    "dataset_fix_with_med = dataset.copy()\n",
    "#Replacing missing data with Tenure X MonthlyCharges\n",
    "for missing_row in missing_value_row :\n",
    "    dataset_fix_with_med['Height_in_Inches'][missing_row] = height_median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "67.93\n",
      "67.93\n",
      "67.93\n"
     ]
    }
   ],
   "source": [
    "#Checking the values in missing rows 12, 110, 187:\n",
    "print(dataset_fix_with_med['Height_in_Inches'][12])\n",
    "print(dataset_fix_with_med['Height_in_Inches'][110])\n",
    "print(dataset_fix_with_med['Height_in_Inches'][187])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Step 5a.2 Filling missing value with Mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We will copy the data into a new datset and replace the values with mean .\n",
    "dataset_fix_with_mean = dataset.copy()\n",
    "#Replacing missing data with Tenure X MonthlyCharges\n",
    "for missing_row in missing_value_row :\n",
    "    dataset_fix_with_mean['Height_in_Inches'][missing_row] = Height_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "67.59050761421318\n",
      "67.59050761421318\n",
      "67.59050761421318\n"
     ]
    }
   ],
   "source": [
    "#Checking the values in missing rows 12, 110, 187:\n",
    "print(dataset_fix_with_mean['Height_in_Inches'][12])\n",
    "print(dataset_fix_with_mean['Height_in_Inches'][110])\n",
    "print(dataset_fix_with_mean['Height_in_Inches'][187])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Step 5a.3 Filling missing value with Calculated Value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "83.74\n",
      "81.37333333333333\n",
      "80.61333333333333\n"
     ]
    }
   ],
   "source": [
    "# We will copy the data into a new datset  .\n",
    "dataset_fix_with_Calcv = dataset.copy()\n",
    "\n",
    "#Replacing missing data with calculated value. Height field will have weight/1.5 as calculated value\n",
    "for missing_row in missing_value_row :\n",
    "    dataset_fix_with_Calcv['Height_in_Inches'][missing_row] = float(dataset_fix_with_Calcv['Weight_in_Pounds'][missing_row])/1.5\n",
    "\n",
    "#Checking the values in missing rows 12, 110, 187:\n",
    "print(dataset_fix_with_Calcv['Height_in_Inches'][12])\n",
    "print(dataset_fix_with_Calcv['Height_in_Inches'][110])\n",
    "print(dataset_fix_with_Calcv['Height_in_Inches'][187])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Index</th>\n",
       "      <th>Height_in_Inches</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>200.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>100.500000</td>\n",
       "      <td>67.595600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>57.879185</td>\n",
       "      <td>5.171788</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>50.750000</td>\n",
       "      <td>66.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>100.500000</td>\n",
       "      <td>67.930000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>150.250000</td>\n",
       "      <td>69.200000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>73.900000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Index  Height_in_Inches\n",
       "count  200.000000        200.000000\n",
       "mean   100.500000         67.595600\n",
       "std     57.879185          5.171788\n",
       "min      1.000000          0.000000\n",
       "25%     50.750000         66.500000\n",
       "50%    100.500000         67.930000\n",
       "75%    150.250000         69.200000\n",
       "max    200.000000         73.900000"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_fix_with_med.describe()\n",
    "#dataset_fix_with_mean.describe()\n",
    "#dataset_fix_with_Calcv.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Observation\n",
    "We have fixed the height_in_inches . Now count is same in as in index, which means missing value is taken care of. Next we have to fix is the min field which is 0. Height cannot be 0. We will identifying the rows containing 0 and replace them with value in 25% percentile."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[158]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "zero_value_row = list(dataset_fix_with_med[dataset_fix_with_med['Height_in_Inches']==0].index)\n",
    "zero_value_row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# So there is only 1 row with zero value. We will replace that with 25th percentile value\n",
    "dataset_fix_with_med['Height_in_Inches'].quantile(0.25)\n",
    "\n",
    "#Replacing the row 158 with 25th percentile value\n",
    "dataset_fix_with_med['Height_in_Inches'][158] = dataset_fix_with_med['Height_in_Inches'].quantile(0.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Index</th>\n",
       "      <th>Height_in_Inches</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>200.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>100.500000</td>\n",
       "      <td>67.92810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>57.879185</td>\n",
       "      <td>1.91877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>63.43000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>50.750000</td>\n",
       "      <td>66.50000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>100.500000</td>\n",
       "      <td>67.93000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>150.250000</td>\n",
       "      <td>69.20000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>73.90000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Index  Height_in_Inches\n",
       "count  200.000000         200.00000\n",
       "mean   100.500000          67.92810\n",
       "std     57.879185           1.91877\n",
       "min      1.000000          63.43000\n",
       "25%     50.750000          66.50000\n",
       "50%    100.500000          67.93000\n",
       "75%    150.250000          69.20000\n",
       "max    200.000000          73.90000"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_fix_with_med.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Observation\n",
    "So No missing values and minimum value 0 replaced with 25th percentile"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 5b : Deleting the missing value rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Missing Value Rows--> [12, 110, 187] \n",
      "Missing data in rows--> 3\n"
     ]
    }
   ],
   "source": [
    "# Making a copy of original dataset\n",
    "dataset_fix_with_del = dataset.copy()\n",
    "\n",
    "#Identifying the rows containing missing data\n",
    "missing_value_row = list(dataset_fix_with_del[dataset_fix_with_del['Height_in_Inches'].isnull()].index)\n",
    "print('Missing Value Rows-->', missing_value_row , '\\nMissing data in rows-->', len(missing_value_row))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Index</th>\n",
       "      <th>Height_in_Inches</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>195.000000</td>\n",
       "      <td>195.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>100.179487</td>\n",
       "      <td>67.605436</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>57.819249</td>\n",
       "      <td>5.234161</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>50.500000</td>\n",
       "      <td>66.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>99.000000</td>\n",
       "      <td>67.940000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>149.500000</td>\n",
       "      <td>69.205000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>73.900000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Index  Height_in_Inches\n",
       "count  195.000000        195.000000\n",
       "mean   100.179487         67.605436\n",
       "std     57.819249          5.234161\n",
       "min      1.000000          0.000000\n",
       "25%     50.500000         66.500000\n",
       "50%     99.000000         67.940000\n",
       "75%    149.500000         69.205000\n",
       "max    200.000000         73.900000"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Dropping the rows with empty values\n",
    "dataset_fix_with_del = dataset_fix_with_del.dropna()\n",
    "dataset_fix_with_del.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Observation \n",
    "Although we had 3 empty rows for height, it deleted 5 rows since it found 2 empty rows in the weight column.\n",
    "Beware while using dropna function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Index</th>\n",
       "      <th>Height_in_Inches</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>197.000000</td>\n",
       "      <td>197.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>100.446701</td>\n",
       "      <td>67.590508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>57.641870</td>\n",
       "      <td>5.211051</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>51.000000</td>\n",
       "      <td>66.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>100.000000</td>\n",
       "      <td>67.930000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>150.000000</td>\n",
       "      <td>69.200000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>73.900000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Index  Height_in_Inches\n",
       "count  197.000000        197.000000\n",
       "mean   100.446701         67.590508\n",
       "std     57.641870          5.211051\n",
       "min      1.000000          0.000000\n",
       "25%     51.000000         66.500000\n",
       "50%    100.000000         67.930000\n",
       "75%    150.000000         69.200000\n",
       "max    200.000000         73.900000"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Making a copy of original dataset\n",
    "dataset_fix_with_del = dataset.copy()\n",
    "\n",
    "dataset_fix_with_del = dataset_fix_with_del.dropna(subset=['Height_in_Inches'])\n",
    "dataset_fix_with_del.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Observation \n",
    "3 rows with values in height column got deleted."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Index</th>\n",
       "      <th>Height_in_Inches</th>\n",
       "      <th>Weight_in_Pounds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>65.78</td>\n",
       "      <td>112.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>71.52</td>\n",
       "      <td>136.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>69.40</td>\n",
       "      <td>153.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>68.22</td>\n",
       "      <td>142.34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>67.79</td>\n",
       "      <td>144.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>68.70</td>\n",
       "      <td>123.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>69.80</td>\n",
       "      <td>141.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>70.01</td>\n",
       "      <td>136.46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>67.90</td>\n",
       "      <td>112.37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>66.78</td>\n",
       "      <td>120.67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>66.49</td>\n",
       "      <td>127.45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>67.62</td>\n",
       "      <td>114.14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>67.12</td>\n",
       "      <td>122.46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>68.28</td>\n",
       "      <td>116.09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>71.09</td>\n",
       "      <td>140</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>66.46</td>\n",
       "      <td>129.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>68.65</td>\n",
       "      <td>142.97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>71.23</td>\n",
       "      <td>137.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>67.13</td>\n",
       "      <td>124.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>67.83</td>\n",
       "      <td>141.28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>68.88</td>\n",
       "      <td>143.54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>63.48</td>\n",
       "      <td>97.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>68.42</td>\n",
       "      <td>129.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>67.63</td>\n",
       "      <td>141.85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>67.21</td>\n",
       "      <td>129.72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>70.84</td>\n",
       "      <td>142.42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>67.49</td>\n",
       "      <td>131.55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>66.53</td>\n",
       "      <td>108.33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>65.44</td>\n",
       "      <td>113.89</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>69.52</td>\n",
       "      <td>103.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>169</th>\n",
       "      <td>170</td>\n",
       "      <td>65.99</td>\n",
       "      <td>111.27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>170</th>\n",
       "      <td>171</td>\n",
       "      <td>69.43</td>\n",
       "      <td>122.61</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>171</th>\n",
       "      <td>172</td>\n",
       "      <td>67.97</td>\n",
       "      <td>124.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>173</td>\n",
       "      <td>67.76</td>\n",
       "      <td>124.65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>173</th>\n",
       "      <td>174</td>\n",
       "      <td>65.28</td>\n",
       "      <td>119.52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>174</th>\n",
       "      <td>175</td>\n",
       "      <td>73.83</td>\n",
       "      <td>139.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>175</th>\n",
       "      <td>176</td>\n",
       "      <td>66.81</td>\n",
       "      <td>104.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>176</th>\n",
       "      <td>177</td>\n",
       "      <td>66.89</td>\n",
       "      <td>123.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>177</th>\n",
       "      <td>178</td>\n",
       "      <td>65.74</td>\n",
       "      <td>118.89</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>178</th>\n",
       "      <td>179</td>\n",
       "      <td>65.98</td>\n",
       "      <td>121.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179</th>\n",
       "      <td>180</td>\n",
       "      <td>66.58</td>\n",
       "      <td>119.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>180</th>\n",
       "      <td>181</td>\n",
       "      <td>67.11</td>\n",
       "      <td>135.02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>181</th>\n",
       "      <td>182</td>\n",
       "      <td>65.87</td>\n",
       "      <td>116.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182</th>\n",
       "      <td>183</td>\n",
       "      <td>66.78</td>\n",
       "      <td>109.17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183</th>\n",
       "      <td>184</td>\n",
       "      <td>68.74</td>\n",
       "      <td>124.22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>184</th>\n",
       "      <td>185</td>\n",
       "      <td>66.23</td>\n",
       "      <td>141.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>185</th>\n",
       "      <td>186</td>\n",
       "      <td>65.96</td>\n",
       "      <td>129.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>186</th>\n",
       "      <td>187</td>\n",
       "      <td>68.58</td>\n",
       "      <td>127.87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>188</th>\n",
       "      <td>189</td>\n",
       "      <td>66.97</td>\n",
       "      <td>127.65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>189</th>\n",
       "      <td>190</td>\n",
       "      <td>68.08</td>\n",
       "      <td>101.47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>190</th>\n",
       "      <td>191</td>\n",
       "      <td>70.19</td>\n",
       "      <td>144.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>192</td>\n",
       "      <td>65.52</td>\n",
       "      <td>110.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>192</th>\n",
       "      <td>193</td>\n",
       "      <td>67.46</td>\n",
       "      <td>132.86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>194</td>\n",
       "      <td>67.41</td>\n",
       "      <td>146.34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>194</th>\n",
       "      <td>195</td>\n",
       "      <td>69.66</td>\n",
       "      <td>145.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>195</th>\n",
       "      <td>196</td>\n",
       "      <td>65.80</td>\n",
       "      <td>120.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>196</th>\n",
       "      <td>197</td>\n",
       "      <td>66.11</td>\n",
       "      <td>115.78</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>197</th>\n",
       "      <td>198</td>\n",
       "      <td>68.24</td>\n",
       "      <td>128.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>199</td>\n",
       "      <td>68.02</td>\n",
       "      <td>127.47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199</th>\n",
       "      <td>200</td>\n",
       "      <td>71.39</td>\n",
       "      <td>127.88</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>197 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Index  Height_in_Inches Weight_in_Pounds\n",
       "0        1             65.78           112.99\n",
       "1        2             71.52           136.49\n",
       "2        3             69.40           153.03\n",
       "3        4             68.22           142.34\n",
       "4        5             67.79            144.3\n",
       "5        6             68.70            123.3\n",
       "6        7             69.80           141.49\n",
       "7        8             70.01           136.46\n",
       "8        9             67.90           112.37\n",
       "9       10             66.78           120.67\n",
       "10      11             66.49           127.45\n",
       "11      12             67.62           114.14\n",
       "13      14             67.12           122.46\n",
       "14      15             68.28           116.09\n",
       "15      16             71.09              140\n",
       "16      17             66.46            129.5\n",
       "17      18             68.65           142.97\n",
       "18      19             71.23            137.9\n",
       "19      20             67.13           124.04\n",
       "20      21             67.83           141.28\n",
       "21      22             68.88           143.54\n",
       "22      23             63.48             97.9\n",
       "23      24             68.42            129.5\n",
       "24      25             67.63           141.85\n",
       "25      26             67.21           129.72\n",
       "26      27             70.84           142.42\n",
       "27      28             67.49           131.55\n",
       "28      29             66.53           108.33\n",
       "29      30             65.44           113.89\n",
       "30      31             69.52            103.3\n",
       "..     ...               ...              ...\n",
       "169    170             65.99           111.27\n",
       "170    171             69.43           122.61\n",
       "171    172             67.97           124.21\n",
       "172    173             67.76           124.65\n",
       "173    174             65.28           119.52\n",
       "174    175             73.83            139.3\n",
       "175    176             66.81           104.83\n",
       "176    177             66.89           123.04\n",
       "177    178             65.74           118.89\n",
       "178    179             65.98           121.49\n",
       "179    180             66.58           119.25\n",
       "180    181             67.11           135.02\n",
       "181    182             65.87           116.23\n",
       "182    183             66.78           109.17\n",
       "183    184             68.74           124.22\n",
       "184    185             66.23           141.16\n",
       "185    186             65.96           129.15\n",
       "186    187             68.58           127.87\n",
       "188    189             66.97           127.65\n",
       "189    190             68.08           101.47\n",
       "190    191             70.19           144.99\n",
       "191    192             65.52           110.95\n",
       "192    193             67.46           132.86\n",
       "193    194             67.41           146.34\n",
       "194    195             69.66           145.59\n",
       "195    196             65.80           120.84\n",
       "196    197             66.11           115.78\n",
       "197    198             68.24            128.3\n",
       "198    199             68.02           127.47\n",
       "199    200             71.39           127.88\n",
       "\n",
       "[197 rows x 3 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Other Flavours \n",
    "    \n",
    "#drop only if ALL columns are NaN\n",
    "dataset_fix_with_del.dropna(how='all')\n",
    "\n",
    "#Drop row if it does not have at least two values that are **not** NaN\n",
    "dataset_fix_with_del.dropna(thresh=2)   \n",
    "\n",
    "#More can be found at : https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Replacing missing data with scikit learn Imputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Height_in_Inches\n",
      "0             65.78\n",
      "1             71.52\n",
      "2             69.40\n",
      "3             68.22\n",
      "4             67.79\n",
      "Missing Value Rows--> [12, 110, 187] \n",
      "Missing data in rows--> 3\n",
      "Mean: 67.59050761421318\n",
      "Median: 67.93\n"
     ]
    }
   ],
   "source": [
    "#Copying the height data into a dataframe\n",
    "height_data = pd.DataFrame({'Height_in_Inches' : dataset['Height_in_Inches']})\n",
    "print(height_data.head())\n",
    "\n",
    "#Identifying the rows containing missing data\n",
    "missing_value_row = list(height_data[height_data['Height_in_Inches'].isnull()].index)\n",
    "print('Missing Value Rows-->', missing_value_row , '\\nMissing data in rows-->', len(missing_value_row))\n",
    "      \n",
    "#Finding mean & Median\n",
    "print( 'Mean:', height_data['Height_in_Inches'].mean())\n",
    "print( 'Median:', height_data['Height_in_Inches'].median())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Missing Value Rows--> [] \n",
      "Missing data in rows--> 0\n",
      "67.5905076142132\n",
      "67.5905076142132\n",
      "67.5905076142132\n"
     ]
    }
   ],
   "source": [
    "# NaN represents blank values, it will be replaced with mean across the column.\n",
    "# Use strategy = 'median' if you want to replace missing value with median.\n",
    "imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)  \n",
    "\n",
    "# Fit the imputer on the height column.\n",
    "height_data['Height_in_Inches']= imputer.fit_transform(height_data[['Height_in_Inches']])\n",
    "\n",
    "#Identifying the rows containing missing data\n",
    "missing_value_row = list(height_data[height_data['Height_in_Inches'].isnull()].index)\n",
    "print('Missing Value Rows-->', missing_value_row , '\\nMissing data in rows-->', len(missing_value_row))\n",
    "\n",
    "#Verify of the missing rows was replaced with mean\n",
    "print(height_data['Height_in_Inches'][12])\n",
    "print(height_data['Height_in_Inches'][110])\n",
    "print(height_data['Height_in_Inches'][187])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Height_in_Inches</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>200.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>67.590508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>5.171622</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>66.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>67.865000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>69.200000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>73.900000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Height_in_Inches\n",
       "count        200.000000\n",
       "mean          67.590508\n",
       "std            5.171622\n",
       "min            0.000000\n",
       "25%           66.500000\n",
       "50%           67.865000\n",
       "75%           69.200000\n",
       "max           73.900000"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "height_data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Zero Value Rows--> [158]\n"
     ]
    }
   ],
   "source": [
    "# Height cannot be 0 . Let's replace the 0 value with median. First Step, we have to find the 0 value row\n",
    "#Identifying the 0 value rows\n",
    "zero_value_row = list(height_data[height_data['Height_in_Inches'] == 0].index)\n",
    "print('Zero Value Rows-->', zero_value_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row 167 has 0 value. We will replace the 0 value with median. Weight cannot be 0.\n",
    "height_data['Height_in_Inches'][zero_value_row] = height_data['Height_in_Inches'].median()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "158    67.865\n",
       "Name: Height_in_Inches, dtype: float64"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Verify the replacement\n",
    "height_data['Height_in_Inches'][zero_value_row]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Height_in_Inches</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>200.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>67.929833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.916551</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>63.430000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>66.522500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>67.882500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>69.200000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>73.900000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Height_in_Inches\n",
       "count        200.000000\n",
       "mean          67.929833\n",
       "std            1.916551\n",
       "min           63.430000\n",
       "25%           66.522500\n",
       "50%           67.882500\n",
       "75%           69.200000\n",
       "max           73.900000"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "height_data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Looks all set! All data cleaned."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fix Weight in Pounds Field"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First 5 rows\n",
      "  Weight_in_Pounds\n",
      "0           112.99\n",
      "1           136.49\n",
      "2           153.03\n",
      "3           142.34\n",
      "4            144.3\n",
      "\n",
      "Variable information - \n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 200 entries, 0 to 199\n",
      "Data columns (total 1 columns):\n",
      "Weight_in_Pounds    198 non-null object\n",
      "dtypes: object(1)\n",
      "memory usage: 1.6+ KB\n",
      "None\n",
      "\n",
      "Getting Basic Statistics\n",
      "\n",
      "       Weight_in_Pounds\n",
      "count               198\n",
      "unique              194\n",
      "top              141.49\n",
      "freq                  2\n"
     ]
    }
   ],
   "source": [
    "#Creating a dataset for weight.\n",
    "weight_data = pd.DataFrame({'Weight_in_Pounds' : dataset['Weight_in_Pounds']})\n",
    "\n",
    "#printing first 5 rows\n",
    "print(\"First 5 rows\")\n",
    "print(weight_data.head())\n",
    "\n",
    "# Basic variable info\n",
    "print('\\nVariable information - ')\n",
    "print(weight_data.info())\n",
    "\n",
    "#Getting the statistics\n",
    "print(\"\\nGetting Basic Statistics\\n\")\n",
    "print(weight_data.describe(include=['O']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Observation\n",
    "1. Weight_in_Pounds is non-numeric. This raises red flag. We are expecting weight to be a numeric field but it turns out to be object or non-numeric. We will assess this field more\n",
    "2. Look at the row - \"count\" . It contains 198 rows. 2 rows missing data\n",
    "3. Look at the row \"unique\" - It has 194 unique counts, meaning rest 6 rows must be having something different. We will find it out."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    Weight_in_Pounds\n",
      "100              NaN\n",
      "116                ?\n",
      "128                 \n",
      "151              NaN\n",
      "Rows with incorrect data -> [100, 116, 128, 151]\n"
     ]
    }
   ],
   "source": [
    "# Printing missing row and data\n",
    "print(weight_data[pd.to_numeric(weight_data['Weight_in_Pounds'], errors='coerce').isnull()])\n",
    "\n",
    "#Capturing missing rows into  a list\n",
    "rows = list(weight_data[pd.to_numeric(weight_data['Weight_in_Pounds'], errors='coerce').isnull()].index)\n",
    "print('Rows with incorrect data ->', rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "#We will replace will rows NaN/ Not NaN or any garbage data with NaN \n",
    "for row_num in rows :\n",
    "    weight_data['Weight_in_Pounds'][row_num] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Missing Value Rows--> [100, 116, 128, 151] \n",
      "Missing data in rows--> 4\n"
     ]
    }
   ],
   "source": [
    "#Identifying the rows containing missing data\n",
    "missing_value_row = list(weight_data[weight_data['Weight_in_Pounds'].isnull()].index)\n",
    "print('Missing Value Rows-->', missing_value_row , '\\nMissing data in rows-->', len(missing_value_row))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 200 entries, 0 to 199\n",
      "Data columns (total 1 columns):\n",
      "Weight_in_Pounds    196 non-null object\n",
      "dtypes: object(1)\n",
      "memory usage: 1.6+ KB\n"
     ]
    }
   ],
   "source": [
    "weight_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We will convert the datatype of weight column from object to float. Thi is needed for imputer replacement.\n",
    "weight_data['Weight_in_Pounds'] = weight_data['Weight_in_Pounds'].astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Missing Value Rows--> [] \n",
      "Missing data in rows--> 0\n",
      "127.76\n",
      "127.76\n",
      "127.76\n",
      "127.76\n"
     ]
    }
   ],
   "source": [
    "#We wil replace the missing weight rows with median value\n",
    "imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)  \n",
    "\n",
    "# Fit the imputer on the height column.\n",
    "weight_data['Weight_in_Pounds']= imputer.fit_transform(weight_data[['Weight_in_Pounds']])\n",
    "\n",
    "#Identifying the rows containing missing data\n",
    "missing_value_row = list(weight_data[weight_data['Weight_in_Pounds'].isnull()].index)\n",
    "print('Missing Value Rows-->', missing_value_row , '\\nMissing data in rows-->', len(missing_value_row))\n",
    "\n",
    "#Verify of the missing rows was replaced with mean\n",
    "print(weight_data['Weight_in_Pounds'][100])\n",
    "print(weight_data['Weight_in_Pounds'][116])\n",
    "print(weight_data['Weight_in_Pounds'][128])\n",
    "print(weight_data['Weight_in_Pounds'][151])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Finally now we have converted the datatype to float and replaced missing values with median. \n",
    "### Next we will again use describe() to evallute the field"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Weight_in_Pounds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>200.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>126.585750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>14.803948</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>119.895000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>127.760000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>135.480000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>158.960000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Weight_in_Pounds\n",
       "count        200.000000\n",
       "mean         126.585750\n",
       "std           14.803948\n",
       "min            0.000000\n",
       "25%          119.895000\n",
       "50%          127.760000\n",
       "75%          135.480000\n",
       "max          158.960000"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "weight_data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Minimum is Still 0. We will replace the 0 value with median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Zero Value Rows--> [167]\n"
     ]
    }
   ],
   "source": [
    "#Identifying the 0 value rows\n",
    "zero_value_row = list(weight_data[weight_data['Weight_in_Pounds'] == 0].index)\n",
    "print('Zero Value Rows-->', zero_value_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row 167 has 0 value. We will replace the 0 value with median. Weight cannot be 0.\n",
    "weight_data['Weight_in_Pounds'][zero_value_row] = weight_data['Weight_in_Pounds'].median()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "167    127.76\n",
       "Name: Weight_in_Pounds, dtype: float64"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Verify the replacement\n",
    "weight_data['Weight_in_Pounds'][zero_value_row]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Perfect! Now we will create the final  dataframe with clean and fixed data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_data = pd.DataFrame({'Height_in_Inches' : height_data['Height_in_Inches'],'Weight_in_Pounds' : weight_data['Weight_in_Pounds'] })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Height_in_Inches</th>\n",
       "      <th>Weight_in_Pounds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>65.78</td>\n",
       "      <td>112.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>71.52</td>\n",
       "      <td>136.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>69.40</td>\n",
       "      <td>153.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>68.22</td>\n",
       "      <td>142.34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>67.79</td>\n",
       "      <td>144.30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Height_in_Inches  Weight_in_Pounds\n",
       "0             65.78            112.99\n",
       "1             71.52            136.49\n",
       "2             69.40            153.03\n",
       "3             68.22            142.34\n",
       "4             67.79            144.30"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Height_in_Inches</th>\n",
       "      <th>Weight_in_Pounds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>200.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>67.929833</td>\n",
       "      <td>127.224550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.916551</td>\n",
       "      <td>11.757181</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>63.430000</td>\n",
       "      <td>97.900000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>66.522500</td>\n",
       "      <td>119.997500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>67.882500</td>\n",
       "      <td>127.760000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>69.200000</td>\n",
       "      <td>135.480000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>73.900000</td>\n",
       "      <td>158.960000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Height_in_Inches  Weight_in_Pounds\n",
       "count        200.000000        200.000000\n",
       "mean          67.929833        127.224550\n",
       "std            1.916551         11.757181\n",
       "min           63.430000         97.900000\n",
       "25%           66.522500        119.997500\n",
       "50%           67.882500        127.760000\n",
       "75%           69.200000        135.480000\n",
       "max           73.900000        158.960000"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data looks pretty clean. Kudos!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}