Skip to content

Instantly share code, notes, and snippets.

@sofia100
Created May 22, 2020 03:10
Show Gist options
  • Save sofia100/b1e011d38fad28acce4d04d329dafc41 to your computer and use it in GitHub Desktop.
Save sofia100/b1e011d38fad28acce4d04d329dafc41 to your computer and use it in GitHub Desktop.
Created on Skills Network Labs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>\n",
" Capstone Project</h1>\n",
" <h3 > - provided by Coursera</h3>\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This project shall see how to use location API to find a new location to live with good neighbours"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Hello Capstone Project Course!\n"
]
}
],
"source": [
"print (\" Hello Capstone Project Course!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Explore and cluster the neighborhoods in Toronto"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"data_url=\"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting BeautifulSoup4\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)\n",
"\u001b[K |████████████████████████████████| 122kB 25.0MB/s eta 0:00:01\n",
"\u001b[?25hCollecting soupsieve>1.2 (from BeautifulSoup4)\n",
" Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl\n",
"Installing collected packages: soupsieve, BeautifulSoup4\n",
"Successfully installed BeautifulSoup4-4.9.1 soupsieve-2.0.1\n",
"Requirement already satisfied: requests in /home/jupyterlab/conda/envs/python/lib/python3.6/site-packages (2.23.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/jupyterlab/conda/envs/python/lib/python3.6/site-packages (from requests) (2020.4.5.1)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /home/jupyterlab/conda/envs/python/lib/python3.6/site-packages (from requests) (3.0.4)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /home/jupyterlab/conda/envs/python/lib/python3.6/site-packages (from requests) (1.25.9)\n",
"Requirement already satisfied: idna<3,>=2.5 in /home/jupyterlab/conda/envs/python/lib/python3.6/site-packages (from requests) (2.9)\n"
]
}
],
"source": [
"#install Beautiful Soup and requests for Web Scaping\n",
"!pip install BeautifulSoup4\n",
"!pip install requests"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting package metadata (current_repodata.json): done\n",
"Solving environment: done\n",
"\n",
"# All requested packages already installed.\n",
"\n",
"Libraries imported.\n"
]
}
],
"source": [
"import numpy as np # library to handle data in a vectorized manner\n",
"\n",
"import pandas as pd # library for data analsysis\n",
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_rows', None)\n",
"\n",
"import json # library to handle JSON files\n",
"\n",
"!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab\n",
"from geopy.geocoders import Nominatim # convert an address into latitude and longitude values\n",
"\n",
"import requests # library to handle requests\n",
"from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe\n",
"\n",
"# Matplotlib and associated plotting modules\n",
"import matplotlib.cm as cm\n",
"import matplotlib.colors as colors\n",
"\n",
"# import k-means from clustering stage\n",
"from sklearn.cluster import KMeans\n",
"\n",
"#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab\n",
"import folium # map rendering library\n",
"\n",
"print('Libraries imported.')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#imports\n",
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"#get html from wiki page and create soup object\n",
"source = requests.get(data_url)\n",
"soup = BeautifulSoup(source.text, 'html')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"#using soup object, iterate the .wikitable to get the data from the HTML page and store it into a list\n",
"data = []\n",
"columns = []\n",
"table = soup.find(class_='wikitable')\n",
"for index, tr in enumerate(table.find_all('tr')):\n",
" section = []\n",
" for td in tr.find_all(['th','td']):\n",
" section.append(td.text.rstrip())\n",
" \n",
" #First row of data is the header\n",
" if (index == 0):\n",
" columns = section\n",
" else:\n",
" data.append(section)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postal Code</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1A</td>\n",
" <td>Not assigned</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M2A</td>\n",
" <td>Not assigned</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postal Code Borough Neighborhood\n",
"0 M1A Not assigned \n",
"1 M2A Not assigned \n",
"2 M3A North York Parkwoods\n",
"3 M4A North York Victoria Village\n",
"4 M5A Downtown Toronto Regent Park, Harbourfront"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#convert list into Pandas DataFrame\n",
"canada_df = pd.DataFrame(data = data,columns = columns)\n",
"canada_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postal Code</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor, Lawrence Heights</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>M7A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Queen's Park, Ontario Provincial Government</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postal Code Borough Neighborhood\n",
"2 M3A North York Parkwoods\n",
"3 M4A North York Victoria Village\n",
"4 M5A Downtown Toronto Regent Park, Harbourfront\n",
"5 M6A North York Lawrence Manor, Lawrence Heights\n",
"6 M7A Downtown Toronto Queen's Park, Ontario Provincial Government"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Remove Boroughs that are 'Not assigned'\n",
"canada_df = canada_df[canada_df['Borough'] != 'Not assigned']\n",
"canada_df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Postal Code</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>M3A</th>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M4A</th>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M5A</th>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M6A</th>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor, Lawrence Heights</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M7A</th>\n",
" <td>Downtown Toronto</td>\n",
" <td>Queen's Park, Ontario Provincial Government</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Borough Neighborhood\n",
"Postal Code \n",
"M3A North York Parkwoods\n",
"M4A North York Victoria Village\n",
"M5A Downtown Toronto Regent Park, Harbourfront\n",
"M6A North York Lawrence Manor, Lawrence Heights\n",
"M7A Downtown Toronto Queen's Park, Ontario Provincial Government"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# More than one neighborhood can exist in one postal code area, combined these into one row with the neighborhoods separated with a comma\n",
"canada_df[\"Neighborhood\"] = canada_df.groupby(\"Postal Code\")[\"Neighborhood\"].transform(lambda neigh: ', '.join(neigh))\n",
"\n",
"#remove duplicates\n",
"canada_df = canada_df.drop_duplicates()\n",
"\n",
"#update index to be postcode if it isn't already\n",
"if(canada_df.index.name != 'Postal Code'):\n",
" canada_df = canada_df.set_index('Postal Code')\n",
" \n",
"canada_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Postal Code</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>M3A</th>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M4A</th>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M5A</th>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M6A</th>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor, Lawrence Heights</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M7A</th>\n",
" <td>Downtown Toronto</td>\n",
" <td>Queen's Park, Ontario Provincial Government</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Borough Neighborhood\n",
"Postal Code \n",
"M3A North York Parkwoods\n",
"M4A North York Victoria Village\n",
"M5A Downtown Toronto Regent Park, Harbourfront\n",
"M6A North York Lawrence Manor, Lawrence Heights\n",
"M7A Downtown Toronto Queen's Park, Ontario Provincial Government"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough\n",
"canada_df['Neighborhood'].replace(\"Not assigned\", canada_df[\"Borough\"],inplace=True)\n",
"canada_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(103, 2)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"canada_df.shape\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python",
"language": "python",
"name": "conda-env-python-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment