Skip to content

Instantly share code, notes, and snippets.

@adityantamarapu
Created June 5, 2020 07:14
Show Gist options
  • Save adityantamarapu/6c3805df7ec27b46e336c8c83b79a8b3 to your computer and use it in GitHub Desktop.
Save adityantamarapu/6c3805df7ec27b46e336c8c83b79a8b3 to your computer and use it in GitHub Desktop.
Created on Skills Network Labs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Getting the data and pouring it into a Dataframe\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"# import html5lib\n",
"# from bs4 import BeautifulSoup\n",
"df=pd.read_html(\"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df=df[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postal Code</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M2A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postal Code Borough Neighborhood\n",
"0 M1A Not assigned Not assigned\n",
"1 M2A Not assigned Not assigned\n",
"2 M3A North York Parkwoods\n",
"3 M4A North York Victoria Village\n",
"4 M5A Downtown Toronto Regent Park, Harbourfront"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning the dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # vc=df[\"Postal Code\"].value_counts()\n",
"# for i in vc:\n",
"# if vc[i]>1:\n",
" \n",
"# print( \"t\")\n",
"# else :\n",
"# print(\"no\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"df.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df.drop(\"index\",axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postal Code</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Malvern, Rouge</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M1C</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge Hill, Port Union, Highland Creek</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M1E</td>\n",
" <td>Scarborough</td>\n",
" <td>Guildwood, Morningside, West Hill</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M1G</td>\n",
" <td>Scarborough</td>\n",
" <td>Woburn</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M1H</td>\n",
" <td>Scarborough</td>\n",
" <td>Cedarbrae</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postal Code Borough Neighborhood\n",
"0 M1B Scarborough Malvern, Rouge\n",
"1 M1C Scarborough Rouge Hill, Port Union, Highland Creek\n",
"2 M1E Scarborough Guildwood, Morningside, West Hill\n",
"3 M1G Scarborough Woburn\n",
"4 M1H Scarborough Cedarbrae"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(180, 3)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i in (df['Postal Code'].value_counts() > 1):\n",
" if i==\"True\":\n",
" print(i)\n",
" else :\n",
" print(\"None\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df=df[df.Borough!='Not assigned']\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(103, 3)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Postal Code 0\n",
"Borough 0\n",
"Neighborhood 0\n",
"dtype: int64"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.Neighborhood=='Not assigned'].count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Latitude and Longitude"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postal Code</th>\n",
" <th>Latitude</th>\n",
" <th>Longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1B</td>\n",
" <td>43.806686</td>\n",
" <td>-79.194353</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M1C</td>\n",
" <td>43.784535</td>\n",
" <td>-79.160497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M1E</td>\n",
" <td>43.763573</td>\n",
" <td>-79.188711</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M1G</td>\n",
" <td>43.770992</td>\n",
" <td>-79.216917</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M1H</td>\n",
" <td>43.773136</td>\n",
" <td>-79.239476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>M9N</td>\n",
" <td>43.706876</td>\n",
" <td>-79.518188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>M9P</td>\n",
" <td>43.696319</td>\n",
" <td>-79.532242</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>M9R</td>\n",
" <td>43.688905</td>\n",
" <td>-79.554724</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>M9V</td>\n",
" <td>43.739416</td>\n",
" <td>-79.588437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>M9W</td>\n",
" <td>43.706748</td>\n",
" <td>-79.594054</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>103 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Postal Code Latitude Longitude\n",
"0 M1B 43.806686 -79.194353\n",
"1 M1C 43.784535 -79.160497\n",
"2 M1E 43.763573 -79.188711\n",
"3 M1G 43.770992 -79.216917\n",
"4 M1H 43.773136 -79.239476\n",
".. ... ... ...\n",
"98 M9N 43.706876 -79.518188\n",
"99 M9P 43.696319 -79.532242\n",
"100 M9R 43.688905 -79.554724\n",
"101 M9V 43.739416 -79.588437\n",
"102 M9W 43.706748 -79.594054\n",
"\n",
"[103 rows x 3 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#I failed to use geocoder. I tried a for loop on the while loop calling geocoder for every postal code.But , it didnt seem to work.\n",
"dfll=pd.read_csv(\"Geospatial_Coordinates.csv\")\n",
"dfll"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df.sort_values(by=[\"Postal Code\"],inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postal Code</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" <th>Latitude</th>\n",
" <th>Longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Malvern, Rouge</td>\n",
" <td>43.806686</td>\n",
" <td>-79.194353</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M1C</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge Hill, Port Union, Highland Creek</td>\n",
" <td>43.784535</td>\n",
" <td>-79.160497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M1E</td>\n",
" <td>Scarborough</td>\n",
" <td>Guildwood, Morningside, West Hill</td>\n",
" <td>43.763573</td>\n",
" <td>-79.188711</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M1G</td>\n",
" <td>Scarborough</td>\n",
" <td>Woburn</td>\n",
" <td>43.770992</td>\n",
" <td>-79.216917</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M1H</td>\n",
" <td>Scarborough</td>\n",
" <td>Cedarbrae</td>\n",
" <td>43.773136</td>\n",
" <td>-79.239476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>M9N</td>\n",
" <td>York</td>\n",
" <td>Weston</td>\n",
" <td>43.706876</td>\n",
" <td>-79.518188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>M9P</td>\n",
" <td>Etobicoke</td>\n",
" <td>Westmount</td>\n",
" <td>43.696319</td>\n",
" <td>-79.532242</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>M9R</td>\n",
" <td>Etobicoke</td>\n",
" <td>Kingsview Village, St. Phillips, Martin Grove ...</td>\n",
" <td>43.688905</td>\n",
" <td>-79.554724</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>M9V</td>\n",
" <td>Etobicoke</td>\n",
" <td>South Steeles, Silverstone, Humbergate, Jamest...</td>\n",
" <td>43.739416</td>\n",
" <td>-79.588437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>M9W</td>\n",
" <td>Etobicoke</td>\n",
" <td>Northwest, West Humber - Clairville</td>\n",
" <td>43.706748</td>\n",
" <td>-79.594054</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>103 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Postal Code Borough \\\n",
"0 M1B Scarborough \n",
"1 M1C Scarborough \n",
"2 M1E Scarborough \n",
"3 M1G Scarborough \n",
"4 M1H Scarborough \n",
".. ... ... \n",
"98 M9N York \n",
"99 M9P Etobicoke \n",
"100 M9R Etobicoke \n",
"101 M9V Etobicoke \n",
"102 M9W Etobicoke \n",
"\n",
" Neighborhood Latitude Longitude \n",
"0 Malvern, Rouge 43.806686 -79.194353 \n",
"1 Rouge Hill, Port Union, Highland Creek 43.784535 -79.160497 \n",
"2 Guildwood, Morningside, West Hill 43.763573 -79.188711 \n",
"3 Woburn 43.770992 -79.216917 \n",
"4 Cedarbrae 43.773136 -79.239476 \n",
".. ... ... ... \n",
"98 Weston 43.706876 -79.518188 \n",
"99 Westmount 43.696319 -79.532242 \n",
"100 Kingsview Village, St. Phillips, Martin Grove ... 43.688905 -79.554724 \n",
"101 South Steeles, Silverstone, Humbergate, Jamest... 43.739416 -79.588437 \n",
"102 Northwest, West Humber - Clairville 43.706748 -79.594054 \n",
"\n",
"[103 rows x 5 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3=pd.concat([df,dfll['Latitude'],dfll['Longitude']],axis=1)\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python",
"language": "python",
"name": "conda-env-python-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment