Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Teguh010/dd79348023424a627e802d82f7bc4307 to your computer and use it in GitHub Desktop.
Save Teguh010/dd79348023424a627e802d82f7bc4307 to your computer and use it in GitHub Desktop.
Created on Cognitive Class Labs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Clustering and Segmenting Neighborhoods in Toronto <a id=\"10\"></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Part 1, Data Scraping <a id=\"0\"></a>"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import urllib3.request\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.cluster import KMeans\n",
"from geopy.geocoders import Nominatim\n",
"import folium\n",
"import os\n",
"import requests\n",
"import json\n",
"from pandas.io.json import json_normalize\n",
"import matplotlib.cm as cm\n",
"import matplotlib.colors as colors"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/jupyterlab/conda/lib/python3.6/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n",
" InsecureRequestWarning)\n"
]
}
],
"source": [
"page_url = \"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M\"\n",
"proxy_url = \"\"  # set to e.g. \"http://host:port\" to route the request through a proxy\n",
"\n",
"# Download the page with requests, which verifies the HTTPS certificate by\n",
"# default (a bare urllib3.PoolManager on older urllib3 releases does not and\n",
"# emits InsecureRequestWarning).\n",
"proxy = proxy_url.strip()\n",
"proxies = {\"http\": proxy, \"https\": proxy} if proxy else None\n",
"req = requests.get(page_url, proxies=proxies, timeout=30)\n",
"req.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n",
"\n",
"# Parse the downloaded HTML; later cells read the postal-code table from `soup`.\n",
"soup = BeautifulSoup(req.content, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Heights</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"0 M3A North York Parkwoods\n",
"1 M4A North York Victoria Village\n",
"2 M5A Downtown Toronto Harbourfront\n",
"3 M5A Downtown Toronto Regent Park\n",
"4 M6A North York Lawrence Heights"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Locate the postal-code table on the downloaded page.\n",
"toronto_table = soup.find('table', {'class': 'wikitable sortable'})\n",
"if toronto_table is None:\n",
"    raise ValueError(\"postal code table not found; the page layout may have changed\")\n",
"\n",
"# Collect one record per data row and build raw_df once at the end:\n",
"# DataFrame.append inside a loop copies the frame on every call (quadratic)\n",
"# and no longer exists in pandas >= 2.0.\n",
"records = []\n",
"rows = toronto_table.find_all('tr')\n",
"for row in rows:\n",
"    row_items = row.find_all('td')\n",
"    if len(row_items) < 3:\n",
"        continue  # rows without three <td> cells (e.g. the header row) carry no data\n",
"    postcode = row_items[0].text.strip()\n",
"    borough = row_items[1].text.strip()\n",
"    # Rows whose borough is \"Not assigned\" are dropped.\n",
"    if borough.lower() != \"not assigned\":\n",
"        neighborhood = row_items[2].text.strip()\n",
"        records.append({'PostalCode': postcode,\n",
"                        'Borough': borough,\n",
"                        'Neighborhood': neighborhood})\n",
"\n",
"# Passing `columns` keeps the three columns (and their order) even when no rows matched.\n",
"raw_df = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])\n",
"raw_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(103, 3)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge, Malvern</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M1C</td>\n",
" <td>Scarborough</td>\n",
" <td>Highland Creek, Rouge Hill, Port Union</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M1E</td>\n",
" <td>Scarborough</td>\n",
" <td>Guildwood, Morningside, West Hill</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M1G</td>\n",
" <td>Scarborough</td>\n",
" <td>Woburn</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M1H</td>\n",
" <td>Scarborough</td>\n",
" <td>Cedarbrae</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"0 M1B Scarborough Rouge, Malvern\n",
"1 M1C Scarborough Highland Creek, Rouge Hill, Port Union\n",
"2 M1E Scarborough Guildwood, Morningside, West Hill\n",
"3 M1G Scarborough Woburn\n",
"4 M1H Scarborough Cedarbrae"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Collapse rows that share a postal code and borough into a single row whose\n",
"# Neighborhood is the comma-separated list of the individual names.\n",
"grouped = []\n",
"for (postal_code, borough), names in raw_df.groupby(['PostalCode', 'Borough'])['Neighborhood']:\n",
"    # \", \".join puts the separator only between items, so nothing has to be\n",
"    # stripped afterwards and names keep any commas or spaces of their own.\n",
"    nblist = \", \".join(str(x) for x in names)\n",
"    # A postal code whose only neighborhood is \"Not assigned\" takes the borough name.\n",
"    if nblist == \"Not assigned\":\n",
"        nblist = borough\n",
"    grouped.append((postal_code, borough, nblist))\n",
"\n",
"toronto_df = pd.DataFrame(grouped, columns=['PostalCode', 'Borough', 'Neighborhood'])\n",
"print(toronto_df.shape)\n",
"toronto_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>M9N</td>\n",
" <td>York</td>\n",
" <td>Weston</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>M9P</td>\n",
" <td>Etobicoke</td>\n",
" <td>Westmount</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>M9R</td>\n",
" <td>Etobicoke</td>\n",
" <td>Kingsview Village, Martin Grove Gardens, Richv...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>M9V</td>\n",
" <td>Etobicoke</td>\n",
" <td>Albion Gardens, Beaumond Heights, Humbergate, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>M9W</td>\n",
" <td>Etobicoke</td>\n",
" <td>Northwest</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"98 M9N York Weston\n",
"99 M9P Etobicoke Westmount\n",
"100 M9R Etobicoke Kingsview Village, Martin Grove Gardens, Richv...\n",
"101 M9V Etobicoke Albion Gardens, Beaumond Heights, Humbergate, ...\n",
"102 M9W Etobicoke Northwest"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the last five rows of the grouped frame.\n",
"toronto_df.tail(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [PostalCode, Borough, Neighborhood]\n",
"Index: []"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: no grouped row may still carry the \"Not assigned\" placeholder.\n",
"unassigned = toronto_df.query(\"Neighborhood == 'Not assigned'\")\n",
"assert unassigned.empty, f\"{len(unassigned)} row(s) still have Neighborhood == 'Not assigned'\"\n",
"unassigned  # displayed for reference; expected to be an empty frame"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>M7A</td>\n",
" <td>Queen's Park</td>\n",
" <td>Queen's Park</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"85 M7A Queen's Park Queen's Park"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the row(s) recorded for postal code M7A.\n",
"m7a_filter = \"PostalCode == 'M7A'\"\n",
"toronto_df.query(m7a_filter)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(103, 3)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (row count, column count) of the grouped frame.\n",
"n_rows, n_cols = toronto_df.shape\n",
"(n_rows, n_cols)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment