dcasmr/BugsReportStatisticsModule.ipynb Secret

## BugsReportStatisticsModule.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import statistics as stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BUG Reported: NaN Values can cause incorrect computations of the median <br> using STATISTICS library functions\n",
    "   > 1. median\n",
    "   > 2. median_high\n",
    "   > 3. median_low"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = [75, 90,85, 92, 95, 80, np.nan]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[75, 90, 85, 92, 95, 80, nan]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate the median when there is a missing value in data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Median = stats.median(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate the median high when there is a missing value in data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Median_high = stats.median_high(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate the median low when there is a missing value in data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Median_low = stats.median_low(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### All these computed values are incorrect due to BUG !"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The incorrect median is: 90\n"
     ]
    }
   ],
   "source": [
    "print(\"The incorrect median is:\", Median)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The incorrect median high value is: 90\n"
     ]
    }
   ],
   "source": [
    "print(\"The incorrect median high value is:\", Median_high)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The incorrect median low value is: 90\n"
     ]
    }
   ],
   "source": [
    "print(\"The incorrect median low value is:\", Median_high)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read data into Pandas and show that Pandas computation of the Median is CORRECT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame(data, columns=['data'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>75.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>90.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>85.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>92.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>95.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>80.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   data\n",
       "0  75.0\n",
       "1  90.0\n",
       "2  85.0\n",
       "3  92.0\n",
       "4  95.0\n",
       "5  80.0\n",
       "6   NaN"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute the median using pandas library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "Median_correct = df['data'].median()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The correct median is 87.5\n"
     ]
    }
   ],
   "source": [
    "print(\"The correct median is\", Median_correct)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solution for properly computing median with numpy when NaN are present <br> Create data2 by dropping NaN values from list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data2 = [x for x in data if str(x) != 'nan']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[75, 90, 85, 92, 95, 80]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now with missing values removed compute the median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "87.5"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stats.median(data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now with missing values removed compute the median low"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "85"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stats.median_low(data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now with missing values removed compute the median high"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "90"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stats.median_high(data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Best way to fix the bug in Python? <br> Just report nan when data has missing values<br> Why??? To make it behave like the mean, harmonic_mean and stdev functions<br> for consistency"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let's try to compute the mean, harmonic mean and standard deviation <br> with statistics library when missing values are present"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute the mean of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Mean = stats.mean(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The mean of the data is nan\n"
     ]
    }
   ],
   "source": [
    "print(\"The mean of the data is\", Mean)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute the harmonic mean of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Harmonic_Mean = stats.harmonic_mean(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The harmonic mean of the data is nan\n"
     ]
    }
   ],
   "source": [
    "print(\"The harmonic mean of the data is\", Harmonic_Mean)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute the sample standard deviation of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "Standard_dev = stats.stdev(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The standard deviation of the data is nan\n"
     ]
    }
   ],
   "source": [
    "print(\"The standard deviation of the data is\", Standard_dev)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"import statistics as stats"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## BUG Reported: NaN Values can cause incorrect computations of the median <br> using STATISTICS library functions\n",
	" > 1. median\n",
	" > 2. median_high\n",
	" > 3. median_low"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"data = [75, 90,85, 92, 95, 80, np.nan]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[75, 90, 85, 92, 95, 80, nan]"
	]
	},
	"execution_count": 18,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Calculate the median when there is a missing value in data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"Median = stats.median(data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Calculate the median high when there is a missing value in data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"Median_high = stats.median_high(data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Calculate the median low when there is a missing value in data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"Median_low = stats.median_low(data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### All these computed values are incorrect due to BUG !"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The incorrect median is: 90\n"
	]
	}
	],
	"source": [
	"print(\"The incorrect median is:\", Median)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The incorrect median high value is: 90\n"
	]
	}
	],
	"source": [
	"print(\"The incorrect median high value is:\", Median_high)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The incorrect median low value is: 90\n"
	]
	}
	],
	"source": [
	"print(\"The incorrect median low value is:\", Median_high)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Read data into Pandas and show that Pandas computation of the Median is CORRECT"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"df = pd.DataFrame(data, columns=['data'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>data</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>75.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>90.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>85.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>92.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>95.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>80.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" data\n",
	"0 75.0\n",
	"1 90.0\n",
	"2 85.0\n",
	"3 92.0\n",
	"4 95.0\n",
	"5 80.0\n",
	"6 NaN"
	]
	},
	"execution_count": 26,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.head(10)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Compute the median using pandas library"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [],
	"source": [
	"Median_correct = df['data'].median()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The correct median is 87.5\n"
	]
	}
	],
	"source": [
	"print(\"The correct median is\", Median_correct)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Solution for properly computing median with numpy when NaN are present <br> Create data2 by dropping NaN values from list"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"data2 = [x for x in data if str(x) != 'nan']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[75, 90, 85, 92, 95, 80]"
	]
	},
	"execution_count": 42,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data2"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Now with missing values removed compute the median"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"87.5"
	]
	},
	"execution_count": 43,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"stats.median(data2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Now with missing values removed compute the median low"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"85"
	]
	},
	"execution_count": 44,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"stats.median_low(data2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Now with missing values removed compute the median high"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 47,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"90"
	]
	},
	"execution_count": 47,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"stats.median_high(data2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Best way to fix the bug in Python? <br> Just report nan when data has missing values<br> Why??? To make it behave like the mean, harmonic_mean and stdev functions<br> for consistency"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Let's try to compute the mean, harmonic mean and standard deviation <br> with statistics library when missing values are present"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Compute the mean of the data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"Mean = stats.mean(data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The mean of the data is nan\n"
	]
	}
	],
	"source": [
	"print(\"The mean of the data is\", Mean)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Compute the harmonic mean of the data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"Harmonic_Mean = stats.harmonic_mean(data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The harmonic mean of the data is nan\n"
	]
	}
	],
	"source": [
	"print(\"The harmonic mean of the data is\", Harmonic_Mean)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Compute the sample standard deviation of the data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {},
	"outputs": [],
	"source": [
	"Standard_dev = stats.stdev(data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The standard deviation of the data is nan\n"
	]
	}
	],
	"source": [
	"print(\"The standard deviation of the data is\", Standard_dev)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}