Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save codebrain001/9a71752689d98cffd4eff03afe27d123 to your computer and use it in GitHub Desktop.
Save codebrain001/9a71752689d98cffd4eff03afe27d123 to your computer and use it in GitHub Desktop.
For Medium article
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy import stats \n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"dataset = pd.read_csv('housing.csv') #reading data set"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" <th>median_house_value</th>\n",
" <th>ocean_proximity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-122.23</td>\n",
" <td>37.88</td>\n",
" <td>41.0</td>\n",
" <td>880.0</td>\n",
" <td>129.0</td>\n",
" <td>322.0</td>\n",
" <td>126.0</td>\n",
" <td>8.3252</td>\n",
" <td>452600.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-122.22</td>\n",
" <td>37.86</td>\n",
" <td>21.0</td>\n",
" <td>7099.0</td>\n",
" <td>1106.0</td>\n",
" <td>2401.0</td>\n",
" <td>1138.0</td>\n",
" <td>8.3014</td>\n",
" <td>358500.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-122.24</td>\n",
" <td>37.85</td>\n",
" <td>52.0</td>\n",
" <td>1467.0</td>\n",
" <td>190.0</td>\n",
" <td>496.0</td>\n",
" <td>177.0</td>\n",
" <td>7.2574</td>\n",
" <td>352100.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-122.25</td>\n",
" <td>37.85</td>\n",
" <td>52.0</td>\n",
" <td>1274.0</td>\n",
" <td>235.0</td>\n",
" <td>558.0</td>\n",
" <td>219.0</td>\n",
" <td>5.6431</td>\n",
" <td>341300.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-122.25</td>\n",
" <td>37.85</td>\n",
" <td>52.0</td>\n",
" <td>1627.0</td>\n",
" <td>280.0</td>\n",
" <td>565.0</td>\n",
" <td>259.0</td>\n",
" <td>3.8462</td>\n",
" <td>342200.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
"0 -122.23 37.88 41.0 880.0 129.0 \n",
"1 -122.22 37.86 21.0 7099.0 1106.0 \n",
"2 -122.24 37.85 52.0 1467.0 190.0 \n",
"3 -122.25 37.85 52.0 1274.0 235.0 \n",
"4 -122.25 37.85 52.0 1627.0 280.0 \n",
"\n",
" population households median_income median_house_value ocean_proximity \n",
"0 322.0 126.0 8.3252 452600.0 NEAR BAY \n",
"1 2401.0 1138.0 8.3014 358500.0 NEAR BAY \n",
"2 496.0 177.0 7.2574 352100.0 NEAR BAY \n",
"3 558.0 219.0 5.6431 341300.0 NEAR BAY \n",
"4 565.0 259.0 3.8462 342200.0 NEAR BAY "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head() #peeking to see what the data set is like"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def convert_array_df(measure, col):\n",
" '''\n",
" This function converts the inputed the array to a rectangular data structure(dataframe), so we can appreciate \n",
" the results better. This function accepts two parameters\n",
" 1. Array of the measure\n",
" 2. Columns\n",
" '''\n",
" results = {\n",
" 'Features': col,\n",
" 'Measure' : measure\n",
" }\n",
" results = pd.DataFrame(results)\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def only_num(data):\n",
" '''\n",
" This function aids the extraction of the numeric values and columns\n",
" '''\n",
" numerical_values = data.select_dtypes(exclude=['object'])\n",
" numerical_columns = data.select_dtypes(exclude=['object']).columns\n",
" return numerical_values, numerical_columns\n",
" \n",
" \n",
"def only_cat(data):\n",
" '''\n",
" This function aids the extraction of the categorical values and columns\n",
" '''\n",
" categorical_values = data.select_dtypes(include=['object'])\n",
" categorical_columns = data.select_dtypes(include=['object']).columns\n",
" return categorical_values, categorical_columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Measures of central tendency to be considered\n",
"- Mean\n",
"- Trimmed Mean\n",
"- Weighted Mean\n",
"- Median\n",
"- Weighted Median \n",
"- Mode"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Mean"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_mean(data):\n",
" '''\n",
" This function calculates the mean of the numeric variables in the data set. It accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" num,col = only_num(data)\n",
" array = np.array(num)\n",
" n_mean = np.nanmean(array, axis=0) \n",
" return convert_array_df(n_mean, col)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>-119.569704</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>35.631861</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>28.639486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>2635.763081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>537.870553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>1425.476744</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>499.539680</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>3.870671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>206855.816909</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude -119.569704\n",
"1 latitude 35.631861\n",
"2 housing_median_age 28.639486\n",
"3 total_rooms 2635.763081\n",
"4 total_bedrooms 537.870553\n",
"5 population 1425.476744\n",
"6 households 499.539680\n",
"7 median_income 3.870671\n",
"8 median_house_value 206855.816909"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the mean of the dataset\n",
"get_mean(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Trimmed Mean"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def get_trimmed_mean(data, truncation):\n",
" '''\n",
" This function calculates the trimmed mean and makes use of the scipy library, the truncation represents the \n",
" portion (in percentage) of the extreme values that is desired to be removed. It accepts two parameters\n",
" 1. Dataset\n",
" 2. Truncation value\n",
" '''\n",
" num,col = only_num(data)\n",
" array = np.array(num)\n",
" tmean = stats.trim_mean(array, truncation)\n",
" return convert_array_df(tmean, col)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>-119.518129</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>35.508249</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>28.494549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>2294.557837</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>477.576248</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>1256.512900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>441.201793</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>3.654012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>192772.995397</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude -119.518129\n",
"1 latitude 35.508249\n",
"2 housing_median_age 28.494549\n",
"3 total_rooms 2294.557837\n",
"4 total_bedrooms 477.576248\n",
"5 population 1256.512900\n",
"6 households 441.201793\n",
"7 median_income 3.654012\n",
"8 median_house_value 192772.995397"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the trimmed mean of the dataset\n",
"get_trimmed_mean(dataset,0.1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Weighted Mean"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def get_weighted_mean(data, weight_name):\n",
" '''\n",
" This function calculates the weighted mean and makes use on masked arrays. it accepts two parameters\n",
" 1. Dataset\n",
" 2. Name of feature desired to be used as weight as a string\n",
" '''\n",
" num,col = only_num(data)\n",
" col = data.select_dtypes(exclude=['object']).columns \n",
" weight = data[weight_name]\n",
" wt_avg = []\n",
" for item in col:\n",
" if item == weight_name:\n",
" ma = np.ma.MaskedArray(data[item], mask=np.isnan((data[item])))\n",
" w_avg = np.ma.average(ma, axis=0, weights=weight)\n",
" ### This tends to calculate the weighted mean on the weights, so to correct this we divide by 2\n",
" w_avg = w_avg/2\n",
" wt_avg.append(w_avg)\n",
" else:\n",
" ma = np.ma.MaskedArray(data[item], mask=np.isnan((data[item])))\n",
" w_avg = np.ma.average(ma, axis =0, weights=weight)\n",
" wt_avg.append(w_avg)\n",
" \n",
" return convert_array_df(wt_avg,col) "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>-119.410904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>35.447274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>25.677624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>4121.237998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>831.999819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>1162.555715</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>775.085802</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>3.877967</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>204596.156992</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude -119.410904\n",
"1 latitude 35.447274\n",
"2 housing_median_age 25.677624\n",
"3 total_rooms 4121.237998\n",
"4 total_bedrooms 831.999819\n",
"5 population 1162.555715\n",
"6 households 775.085802\n",
"7 median_income 3.877967\n",
"8 median_house_value 204596.156992"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the weighted mean of the dataset and using the feature 'population' as the weight\n",
"get_weighted_mean(dataset, 'population')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Median"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def get_median(data):\n",
" '''\n",
" This function calculates the median of the numeric variables in the data set. It accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" num,col = only_num(data)\n",
" array = np.array(num)\n",
" n_median = np.nanmedian(array, axis=0)\n",
" return convert_array_df(n_median, col)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>-118.4900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>34.2600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>29.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>2127.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>435.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>1166.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>409.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>3.5348</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>179700.0000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude -118.4900\n",
"1 latitude 34.2600\n",
"2 housing_median_age 29.0000\n",
"3 total_rooms 2127.0000\n",
"4 total_bedrooms 435.0000\n",
"5 population 1166.0000\n",
"6 households 409.0000\n",
"7 median_income 3.5348\n",
"8 median_house_value 179700.0000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the median of the dataset\n",
"get_median(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Weighted Median"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def get_weighted_median(data, weight_name):\n",
" '''\n",
" This function calculates weighted median, it makes use of wquantile library. it accepts only two parameter\n",
" 1. Dataset\n",
" 2. Name of feature desired to be used as weight as a string\n",
" '''\n",
" # ! pip install wquantiles\n",
" import weighted\n",
" \n",
" num,col = only_num(data)\n",
" weight_name = weight_name.lower()\n",
" weight = data[weight_name]\n",
" wt_median = []\n",
" \n",
" for item in col:\n",
" if item == weight_name:\n",
" w_median = weighted.median(data[item], weights=weight)\n",
" w_median = w_median/2\n",
" wt_median.append(w_median)\n",
" else:\n",
" w_median = weighted.median(data[item], weights=weight)\n",
" wt_median.append(w_median)\n",
" \n",
" return convert_array_df(wt_median, col)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>-119.410904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>35.447274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>25.677624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>4121.237998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>831.999819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>1162.555715</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>775.085802</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>3.877967</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>204596.156992</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude -119.410904\n",
"1 latitude 35.447274\n",
"2 housing_median_age 25.677624\n",
"3 total_rooms 4121.237998\n",
"4 total_bedrooms 831.999819\n",
"5 population 1162.555715\n",
"6 households 775.085802\n",
"7 median_income 3.877967\n",
"8 median_house_value 204596.156992"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the weighted_median of the dataset\n",
"get_weighted_mean(dataset, 'population')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Mode"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def get_mode(data):\n",
" '''\n",
" This function calculates the mode of the categorical features, it makes use of wquantile library. it accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" cat,col = only_cat(data)\n",
" n_mode = stats.mode(cat)\n",
" return n_mode"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ModeResult(mode=array([['<1H OCEAN']], dtype=object), count=array([[9136]]))"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the mode of the dataset\n",
"get_mode(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Measure of Dispersion\n",
"- Range\n",
"- Interquartile range\n",
"- Mean absolute deviation\n",
"- Variance\n",
"- Standard deviation \n",
"- Median absolute deviation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Range"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def get_range(data):\n",
" '''\n",
" This function calculates the range of the numeric variables in the data set. It accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" \n",
" num, col = only_num(data)\n",
" \n",
" results = []\n",
" for item in col:\n",
" item_range = []\n",
" for value in data[item]:\n",
" item_range.append(value)\n",
" limit1 = min(item_range)\n",
" limit2 = max(item_range)\n",
" range_ = limit2- limit1\n",
" results.append(range_)\n",
" return convert_array_df(results, col)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>10.0400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>9.4100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>51.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>39318.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>6444.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>35679.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>6081.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>14.5002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>485002.0000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude 10.0400\n",
"1 latitude 9.4100\n",
"2 housing_median_age 51.0000\n",
"3 total_rooms 39318.0000\n",
"4 total_bedrooms 6444.0000\n",
"5 population 35679.0000\n",
"6 households 6081.0000\n",
"7 median_income 14.5002\n",
"8 median_house_value 485002.0000"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the range of the dataset\n",
"get_range(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Interquartile range"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def get_IQR(data):\n",
" '''\n",
" This function calculates the Interquartile range of the numeric variables in the data set. It accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" num,col = only_num(data)\n",
" results = []\n",
" for item in col:\n",
" result = stats.iqr(data[item],nan_policy='omit') # The NaN defines how to handle when input contains nan.\n",
" results.append(result)\n",
" return convert_array_df(results, col)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>3.79000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>3.78000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>19.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>1700.25000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>351.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>938.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>325.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>2.17985</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>145125.00000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude 3.79000\n",
"1 latitude 3.78000\n",
"2 housing_median_age 19.00000\n",
"3 total_rooms 1700.25000\n",
"4 total_bedrooms 351.00000\n",
"5 population 938.00000\n",
"6 households 325.00000\n",
"7 median_income 2.17985\n",
"8 median_house_value 145125.00000"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the IQR of the dataset\n",
"get_IQR(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Mean absolute deviation"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def mean_absolute_dev(data):\n",
" '''\n",
" This function calculates the Mean absolute deviation of the numeric variables in the data set. It accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" num,col = only_num(data)\n",
" results = []\n",
" for item in col:\n",
" ### computing the formula 'mean(abs(data-mean(data)))'\n",
" a = np.nanmean(np.array(data[item]))\n",
" b = np.array(data[item])\n",
" result = np.nanmean(np.absolute(b-a))\n",
" results.append(result)\n",
" return convert_array_df(results, col)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>1.830206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>1.975024</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>10.551539</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>1344.462236</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>270.923606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>714.237277</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>247.195367</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>1.401614</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>91170.439944</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude 1.830206\n",
"1 latitude 1.975024\n",
"2 housing_median_age 10.551539\n",
"3 total_rooms 1344.462236\n",
"4 total_bedrooms 270.923606\n",
"5 population 714.237277\n",
"6 households 247.195367\n",
"7 median_income 1.401614\n",
"8 median_house_value 91170.439944"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the Mean absolute deviation of the dataset\n",
"mean_absolute_dev(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Variance"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"def get_var(data):\n",
" '''\n",
" This function calculates the Variance of the numeric variables in the data set. It accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" num,col = only_num(data)\n",
" array = np.array(num)\n",
" var = np.nanvar(array, axis=0) \n",
" return convert_array_df(var, col)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>4.013945e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>4.562072e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>1.583886e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>4.759215e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>1.775567e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>1.282408e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>1.461690e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>3.609148e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>1.331550e+10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude 4.013945e+00\n",
"1 latitude 4.562072e+00\n",
"2 housing_median_age 1.583886e+02\n",
"3 total_rooms 4.759215e+06\n",
"4 total_bedrooms 1.775567e+05\n",
"5 population 1.282408e+06\n",
"6 households 1.461690e+05\n",
"7 median_income 3.609148e+00\n",
"8 median_house_value 1.331550e+10"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the Variance of the dataset\n",
"get_var(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Standard deviation"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"def get_std(data):\n",
" '''\n",
" This function calculates the Standard deviation of the numeric variables in the data set. It accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" num,col = only_num(data)\n",
" array = np.array(num)\n",
" std = np.nanstd(array, axis=0) \n",
" return convert_array_df(std, col)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>2.003483</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>2.135901</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>12.585253</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>2181.562402</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>421.374759</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>1132.434688</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>382.320491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>1.899776</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>115392.820404</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude 2.003483\n",
"1 latitude 2.135901\n",
"2 housing_median_age 12.585253\n",
"3 total_rooms 2181.562402\n",
"4 total_bedrooms 421.374759\n",
"5 population 1132.434688\n",
"6 households 382.320491\n",
"7 median_income 1.899776\n",
"8 median_house_value 115392.820404"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the Standard deviation of the dataset\n",
"get_std(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Median absolute deviation (MAD)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def get_MAD(data):\n",
" '''\n",
" This function calculates the Median absolute deviation of the numeric variables in the data set. It accepts only one parameter\n",
" 1. Dataset\n",
" '''\n",
" num,col = only_num(data)\n",
" results = []\n",
" for item in col:\n",
" result = stats.median_absolute_deviation(data[item],nan_policy='omit') # The NaN defines how to handle when input contains nan.\n",
" results.append(result)\n",
" return convert_array_df(results, col)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Features</th>\n",
" <th>Measure</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>longitude</td>\n",
" <td>1.897728</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>latitude</td>\n",
" <td>1.823598</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>housing_median_age</td>\n",
" <td>14.826000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>total_rooms</td>\n",
" <td>1181.632200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>total_bedrooms</td>\n",
" <td>240.181200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>population</td>\n",
" <td>652.344000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>households</td>\n",
" <td>223.872600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>median_income</td>\n",
" <td>1.577783</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>median_house_value</td>\n",
" <td>101409.840000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Features Measure\n",
"0 longitude 1.897728\n",
"1 latitude 1.823598\n",
"2 housing_median_age 14.826000\n",
"3 total_rooms 1181.632200\n",
"4 total_bedrooms 240.181200\n",
"5 population 652.344000\n",
"6 households 223.872600\n",
"7 median_income 1.577783\n",
"8 median_house_value 101409.840000"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calulcating the Median absolute deviation of the dataset\n",
"get_MAD(dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment