Skip to content

Instantly share code, notes, and snippets.

@egy1st
Created February 23, 2022 22:36
Show Gist options
  • Save egy1st/b8383f28411e3e9851535b7ae90a813e to your computer and use it in GitHub Desktop.
Save egy1st/b8383f28411e3e9851535b7ae90a813e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tornoto Neighborhood Scrape"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"# html_doc = \"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M\"\n"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# import pandas library to work with dataframe\n",
"import pandas as pd\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: BeautifulSoup4 in /home/jupyterlab/conda/lib/python3.6/site-packages (4.6.3)\n",
"Requirement already satisfied: lxml in /home/jupyterlab/conda/lib/python3.6/site-packages (4.2.5)\n",
"Requirement already satisfied: html5lib in /home/jupyterlab/conda/lib/python3.6/site-packages (0.9999999)\n"
]
}
],
"source": [
"# installing beautifulsoup4 package for scraping wikipedia page\n",
"# also install proper needed parsers to work with beautifulsoup4\n",
"\n",
"!pip install BeautifulSoup4\n",
"!pip install lxml\n",
"!pip install html5lib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import bs4\n",
"import requests # library to handle requests\n",
"import json # library to handle JSON files\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# URL on wikipedia to scrap, thus we can extract postal codes, Borough and Neighbrhood \n",
"url = \"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M\"\n",
"html_doc = requests.get(url).text"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"soup = BeautifulSoup(html_doc, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Our dataframe now is 103 rows\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" <th>Latitude</th>\n",
" <th>Longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" <td>43.753259</td>\n",
" <td>-79.329656</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" <td>43.725882</td>\n",
" <td>-79.315572</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront, Regent Park</td>\n",
" <td>43.654260</td>\n",
" <td>-79.360636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Heights, Lawrence Manor</td>\n",
" <td>43.718518</td>\n",
" <td>-79.464763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M7A</td>\n",
" <td>Queen's Park</td>\n",
" <td>Queen's Park</td>\n",
" <td>43.662301</td>\n",
" <td>-79.389494</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>M9A</td>\n",
" <td>Etobicoke</td>\n",
" <td>Islington Avenue</td>\n",
" <td>43.667856</td>\n",
" <td>-79.532242</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge, Malvern</td>\n",
" <td>43.806686</td>\n",
" <td>-79.194353</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>M3B</td>\n",
" <td>North York</td>\n",
" <td>Don Mills North</td>\n",
" <td>43.745906</td>\n",
" <td>-79.352188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>M4B</td>\n",
" <td>East York</td>\n",
" <td>Woodbine Gardens, Parkview Hill</td>\n",
" <td>43.706397</td>\n",
" <td>-79.309937</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>M5B</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Ryerson, Garden District</td>\n",
" <td>43.657162</td>\n",
" <td>-79.378937</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>M6B</td>\n",
" <td>North York</td>\n",
" <td>Glencairn</td>\n",
" <td>43.709577</td>\n",
" <td>-79.445073</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>M9B</td>\n",
" <td>Etobicoke</td>\n",
" <td>Princess Gardens, West Deane Park</td>\n",
" <td>43.650943</td>\n",
" <td>-79.554724</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>M1C</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge Hill, Port Union</td>\n",
" <td>43.784535</td>\n",
" <td>-79.160497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>M3C</td>\n",
" <td>North York</td>\n",
" <td>Flemingdon Park, Don Mills South</td>\n",
" <td>43.725900</td>\n",
" <td>-79.340923</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>M4C</td>\n",
" <td>East York</td>\n",
" <td>Woodbine Heights</td>\n",
" <td>43.695344</td>\n",
" <td>-79.318389</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>M5C</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>St. James Town</td>\n",
" <td>43.651494</td>\n",
" <td>-79.375418</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>M6C</td>\n",
" <td>York</td>\n",
" <td>Humewood-Cedarvale</td>\n",
" <td>43.693781</td>\n",
" <td>-79.428191</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>M9C</td>\n",
" <td>Etobicoke</td>\n",
" <td>Markland Wood, Old Burnhamthorpe</td>\n",
" <td>43.643515</td>\n",
" <td>-79.577201</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>M1E</td>\n",
" <td>Scarborough</td>\n",
" <td>Morningside, West Hill</td>\n",
" <td>43.763573</td>\n",
" <td>-79.188711</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>M4E</td>\n",
" <td>East Toronto</td>\n",
" <td>The Beaches</td>\n",
" <td>43.676357</td>\n",
" <td>-79.293031</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>M5E</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Berczy Park</td>\n",
" <td>43.644771</td>\n",
" <td>-79.373306</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>M6E</td>\n",
" <td>York</td>\n",
" <td>Caledonia-Fairbanks</td>\n",
" <td>43.689026</td>\n",
" <td>-79.453512</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>M1G</td>\n",
" <td>Scarborough</td>\n",
" <td>Woburn</td>\n",
" <td>43.770992</td>\n",
" <td>-79.216917</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>M4G</td>\n",
" <td>East York</td>\n",
" <td>Leaside</td>\n",
" <td>43.709060</td>\n",
" <td>-79.363452</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>M5G</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Central Bay Street</td>\n",
" <td>43.657952</td>\n",
" <td>-79.387383</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>M6G</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Christie</td>\n",
" <td>43.669542</td>\n",
" <td>-79.422564</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>M1H</td>\n",
" <td>Scarborough</td>\n",
" <td>Cedarbrae</td>\n",
" <td>43.773136</td>\n",
" <td>-79.239476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>M2H</td>\n",
" <td>North York</td>\n",
" <td>Hillcrest Village</td>\n",
" <td>43.803762</td>\n",
" <td>-79.363452</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>M3H</td>\n",
" <td>North York</td>\n",
" <td>Downsview North, Wilson Heights</td>\n",
" <td>43.754328</td>\n",
" <td>-79.442259</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>M4H</td>\n",
" <td>East York</td>\n",
" <td>Thorncliffe Park</td>\n",
" <td>43.705369</td>\n",
" <td>-79.349372</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>M4R</td>\n",
" <td>Central Toronto</td>\n",
" <td>North Toronto West</td>\n",
" <td>43.715383</td>\n",
" <td>-79.405678</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>M5R</td>\n",
" <td>Central Toronto</td>\n",
" <td>North Midtown, Yorkville</td>\n",
" <td>43.672710</td>\n",
" <td>-79.405678</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>M6R</td>\n",
" <td>West Toronto</td>\n",
" <td>Parkdale, Roncesvalles</td>\n",
" <td>43.648960</td>\n",
" <td>-79.456325</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>M7R</td>\n",
" <td>Mississauga</td>\n",
" <td>Canada Post Gateway Processing Centre</td>\n",
" <td>43.636966</td>\n",
" <td>-79.615819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>M9R</td>\n",
" <td>Etobicoke</td>\n",
" <td>Richview Gardens, St. Phillips</td>\n",
" <td>43.688905</td>\n",
" <td>-79.554724</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>M1S</td>\n",
" <td>Scarborough</td>\n",
" <td>Agincourt</td>\n",
" <td>43.794200</td>\n",
" <td>-79.262029</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>M4S</td>\n",
" <td>Central Toronto</td>\n",
" <td>Davisville</td>\n",
" <td>43.704324</td>\n",
" <td>-79.388790</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80</th>\n",
" <td>M5S</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbord, University of Toronto</td>\n",
" <td>43.662696</td>\n",
" <td>-79.400049</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81</th>\n",
" <td>M6S</td>\n",
" <td>West Toronto</td>\n",
" <td>Runnymede, Swansea</td>\n",
" <td>43.651571</td>\n",
" <td>-79.484450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82</th>\n",
" <td>M1T</td>\n",
" <td>Scarborough</td>\n",
" <td>Sullivan, Tam O'Shanter</td>\n",
" <td>43.781638</td>\n",
" <td>-79.304302</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
" <td>M4T</td>\n",
" <td>Central Toronto</td>\n",
" <td>Moore Park, Summerhill East</td>\n",
" <td>43.689574</td>\n",
" <td>-79.383160</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>M5T</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Grange Park, Kensington Market</td>\n",
" <td>43.653206</td>\n",
" <td>-79.400049</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>M1V</td>\n",
" <td>Scarborough</td>\n",
" <td>Milliken, Steeles East</td>\n",
" <td>43.815252</td>\n",
" <td>-79.284577</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>M4V</td>\n",
" <td>Central Toronto</td>\n",
" <td>South Hill, Summerhill West</td>\n",
" <td>43.686412</td>\n",
" <td>-79.400049</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>M5V</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Railway Lands, South Niagara</td>\n",
" <td>43.628947</td>\n",
" <td>-79.394420</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>M8V</td>\n",
" <td>Etobicoke</td>\n",
" <td>Mimico South, New Toronto</td>\n",
" <td>43.605647</td>\n",
" <td>-79.501321</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>M9V</td>\n",
" <td>Etobicoke</td>\n",
" <td>South Steeles, Thistletown</td>\n",
" <td>43.739416</td>\n",
" <td>-79.588437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>M1W</td>\n",
" <td>Scarborough</td>\n",
" <td>L'Amoreaux West, Steeles West</td>\n",
" <td>43.799525</td>\n",
" <td>-79.318389</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>M4W</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Rosedale</td>\n",
" <td>43.679563</td>\n",
" <td>-79.377529</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>M5W</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Stn A PO Boxes 25 The Esplanade</td>\n",
" <td>43.646435</td>\n",
" <td>-79.374846</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>M8W</td>\n",
" <td>Etobicoke</td>\n",
" <td>Alderwood, Long Branch</td>\n",
" <td>43.602414</td>\n",
" <td>-79.543484</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>M9W</td>\n",
" <td>Etobicoke</td>\n",
" <td>Northwest</td>\n",
" <td>43.706748</td>\n",
" <td>-79.594054</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>M1X</td>\n",
" <td>Scarborough</td>\n",
" <td>Upper Rouge</td>\n",
" <td>43.836125</td>\n",
" <td>-79.205636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>M4X</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Cabbagetown, St. James Town</td>\n",
" <td>43.667967</td>\n",
" <td>-79.367675</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>M5X</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>First Canadian Place, Underground city</td>\n",
" <td>43.648429</td>\n",
" <td>-79.382280</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>M8X</td>\n",
" <td>Etobicoke</td>\n",
" <td>Montgomery Road, Old Mill North</td>\n",
" <td>43.653654</td>\n",
" <td>-79.506944</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>M4Y</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Church and Wellesley</td>\n",
" <td>43.665860</td>\n",
" <td>-79.383160</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>M7Y</td>\n",
" <td>East Toronto</td>\n",
" <td>Business Reply Mail Processing Centre 969 Eastern</td>\n",
" <td>43.662744</td>\n",
" <td>-79.321558</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>M8Y</td>\n",
" <td>Etobicoke</td>\n",
" <td>Royal York South East, Sunnylea</td>\n",
" <td>43.636258</td>\n",
" <td>-79.498509</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>M8Z</td>\n",
" <td>Etobicoke</td>\n",
" <td>Royal York South West, South of Bloor</td>\n",
" <td>43.628841</td>\n",
" <td>-79.520999</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>103 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough \\\n",
"0 M3A North York \n",
"1 M4A North York \n",
"2 M5A Downtown Toronto \n",
"3 M6A North York \n",
"4 M7A Queen's Park \n",
"5 M9A Etobicoke \n",
"6 M1B Scarborough \n",
"7 M3B North York \n",
"8 M4B East York \n",
"9 M5B Downtown Toronto \n",
"10 M6B North York \n",
"11 M9B Etobicoke \n",
"12 M1C Scarborough \n",
"13 M3C North York \n",
"14 M4C East York \n",
"15 M5C Downtown Toronto \n",
"16 M6C York \n",
"17 M9C Etobicoke \n",
"18 M1E Scarborough \n",
"19 M4E East Toronto \n",
"20 M5E Downtown Toronto \n",
"21 M6E York \n",
"22 M1G Scarborough \n",
"23 M4G East York \n",
"24 M5G Downtown Toronto \n",
"25 M6G Downtown Toronto \n",
"26 M1H Scarborough \n",
"27 M2H North York \n",
"28 M3H North York \n",
"29 M4H East York \n",
".. ... ... \n",
"73 M4R Central Toronto \n",
"74 M5R Central Toronto \n",
"75 M6R West Toronto \n",
"76 M7R Mississauga \n",
"77 M9R Etobicoke \n",
"78 M1S Scarborough \n",
"79 M4S Central Toronto \n",
"80 M5S Downtown Toronto \n",
"81 M6S West Toronto \n",
"82 M1T Scarborough \n",
"83 M4T Central Toronto \n",
"84 M5T Downtown Toronto \n",
"85 M1V Scarborough \n",
"86 M4V Central Toronto \n",
"87 M5V Downtown Toronto \n",
"88 M8V Etobicoke \n",
"89 M9V Etobicoke \n",
"90 M1W Scarborough \n",
"91 M4W Downtown Toronto \n",
"92 M5W Downtown Toronto \n",
"93 M8W Etobicoke \n",
"94 M9W Etobicoke \n",
"95 M1X Scarborough \n",
"96 M4X Downtown Toronto \n",
"97 M5X Downtown Toronto \n",
"98 M8X Etobicoke \n",
"99 M4Y Downtown Toronto \n",
"100 M7Y East Toronto \n",
"101 M8Y Etobicoke \n",
"102 M8Z Etobicoke \n",
"\n",
" Neighborhood Latitude Longitude \n",
"0 Parkwoods 43.753259 -79.329656 \n",
"1 Victoria Village 43.725882 -79.315572 \n",
"2 Harbourfront, Regent Park 43.654260 -79.360636 \n",
"3 Lawrence Heights, Lawrence Manor 43.718518 -79.464763 \n",
"4 Queen's Park 43.662301 -79.389494 \n",
"5 Islington Avenue 43.667856 -79.532242 \n",
"6 Rouge, Malvern 43.806686 -79.194353 \n",
"7 Don Mills North 43.745906 -79.352188 \n",
"8 Woodbine Gardens, Parkview Hill 43.706397 -79.309937 \n",
"9 Ryerson, Garden District 43.657162 -79.378937 \n",
"10 Glencairn 43.709577 -79.445073 \n",
"11 Princess Gardens, West Deane Park 43.650943 -79.554724 \n",
"12 Rouge Hill, Port Union 43.784535 -79.160497 \n",
"13 Flemingdon Park, Don Mills South 43.725900 -79.340923 \n",
"14 Woodbine Heights 43.695344 -79.318389 \n",
"15 St. James Town 43.651494 -79.375418 \n",
"16 Humewood-Cedarvale 43.693781 -79.428191 \n",
"17 Markland Wood, Old Burnhamthorpe 43.643515 -79.577201 \n",
"18 Morningside, West Hill 43.763573 -79.188711 \n",
"19 The Beaches 43.676357 -79.293031 \n",
"20 Berczy Park 43.644771 -79.373306 \n",
"21 Caledonia-Fairbanks 43.689026 -79.453512 \n",
"22 Woburn 43.770992 -79.216917 \n",
"23 Leaside 43.709060 -79.363452 \n",
"24 Central Bay Street 43.657952 -79.387383 \n",
"25 Christie 43.669542 -79.422564 \n",
"26 Cedarbrae 43.773136 -79.239476 \n",
"27 Hillcrest Village 43.803762 -79.363452 \n",
"28 Downsview North, Wilson Heights 43.754328 -79.442259 \n",
"29 Thorncliffe Park 43.705369 -79.349372 \n",
".. ... ... ... \n",
"73 North Toronto West 43.715383 -79.405678 \n",
"74 North Midtown, Yorkville 43.672710 -79.405678 \n",
"75 Parkdale, Roncesvalles 43.648960 -79.456325 \n",
"76 Canada Post Gateway Processing Centre 43.636966 -79.615819 \n",
"77 Richview Gardens, St. Phillips 43.688905 -79.554724 \n",
"78 Agincourt 43.794200 -79.262029 \n",
"79 Davisville 43.704324 -79.388790 \n",
"80 Harbord, University of Toronto 43.662696 -79.400049 \n",
"81 Runnymede, Swansea 43.651571 -79.484450 \n",
"82 Sullivan, Tam O'Shanter 43.781638 -79.304302 \n",
"83 Moore Park, Summerhill East 43.689574 -79.383160 \n",
"84 Grange Park, Kensington Market 43.653206 -79.400049 \n",
"85 Milliken, Steeles East 43.815252 -79.284577 \n",
"86 South Hill, Summerhill West 43.686412 -79.400049 \n",
"87 Railway Lands, South Niagara 43.628947 -79.394420 \n",
"88 Mimico South, New Toronto 43.605647 -79.501321 \n",
"89 South Steeles, Thistletown 43.739416 -79.588437 \n",
"90 L'Amoreaux West, Steeles West 43.799525 -79.318389 \n",
"91 Rosedale 43.679563 -79.377529 \n",
"92 Stn A PO Boxes 25 The Esplanade 43.646435 -79.374846 \n",
"93 Alderwood, Long Branch 43.602414 -79.543484 \n",
"94 Northwest 43.706748 -79.594054 \n",
"95 Upper Rouge 43.836125 -79.205636 \n",
"96 Cabbagetown, St. James Town 43.667967 -79.367675 \n",
"97 First Canadian Place, Underground city 43.648429 -79.382280 \n",
"98 Montgomery Road, Old Mill North 43.653654 -79.506944 \n",
"99 Church and Wellesley 43.665860 -79.383160 \n",
"100 Business Reply Mail Processing Centre 969 Eastern 43.662744 -79.321558 \n",
"101 Royal York South East, Sunnylea 43.636258 -79.498509 \n",
"102 Royal York South West, South of Bloor 43.628841 -79.520999 \n",
"\n",
"[103 rows x 5 columns]"
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# extract the postal code table\n",
"postal_data = []\n",
"table = soup.find('table', attrs={'class':'wikitable sortable'})\n",
"table_body = table.find('tbody')\n",
"\n",
"rows = table_body.find_all('tr')\n",
"for row in rows:\n",
" cols = row.find_all('td')\n",
" cols = [ele.text.strip() for ele in cols]\n",
" postal_data.append([ele for ele in cols if ele]) # Get rid of empty values\n",
"\n",
"# the resulted is a list, so let us convert it to dataframe \n",
"df = pd.DataFrame(data=postal_data)\n",
"\n",
"# assign proper headers to columns\n",
"df.columns = ['PostalCode', 'Borough', 'Neighborhood']\n",
"\n",
"# Ignore cells with a borough that is Not assigned. \n",
"df_new = df [df.Borough != \"Not assigned\"]\n",
"\n",
"# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough\n",
"for index, row in df_new.iterrows():\n",
" if row[\"Neighborhood\"] == \"Not assigned\" :\n",
" row[\"Neighborhood\"] = row[\"Borough\"]\n",
"\n",
"# let us sort dataframe by column 'PostalCode', thus we can handle the next process\n",
"df_new.sort_values(by=['PostalCode'])\n",
"# delete first row conducted by parser as empty\n",
"df_new = df_new.iloc[1:]\n",
"\n",
"\n",
"# More than one neighborhood can exist in one postal code area\n",
"# let us comine these neighbors into one row\n",
"df_unique = df_new.PostalCode.unique()\n",
"dictOfPostal = dict.fromkeys(df_unique , '')\n",
"\n",
"\n",
"# read the latitude and longtude from csv file\n",
"geocsv = pd.read_csv(\"Geospatial_Coordinates.csv\")\n",
"\n",
"# create dictioanry for latitude property\n",
"dic_latitude = {}\n",
"geo_latitude = geocsv.iloc [:, 0:2]\n",
"for index, row in geo_latitude.iterrows():\n",
" dic_latitude[row['Postal_Code']] = row['Latitude']\n",
"\n",
"# create dictioanry for longtitude property\n",
"dic_longitude = {}\n",
"geo_longitude = geocsv.iloc [:, [0,2]]\n",
"for index, row in geo_longitude.iterrows():\n",
" dic_longitude[row['Postal_Code']] = row['Longitude']\n",
"\n",
"# the main concatenation of joining latitude & longtide from dictionaris with other properties from dataframe\n",
"old_postal_code = ''\n",
"old_neighborhood = ''\n",
"new_neighborhood = ''\n",
"for index, row in df_new.iterrows():\n",
" \n",
" if old_postal_code != row['PostalCode'] :\n",
" old_postal_code = old_postal_code\n",
" dictOfPostal[row['PostalCode']] = [ row['Borough'], row['Neighborhood'], dic_latitude[row['PostalCode']], dic_longitude[row['PostalCode']] ]\n",
" elif old_postal_code == row['PostalCode'] and old_postal_code != '':\n",
" new_neighborhood = old_neighborhood + ', ' + row['Neighborhood']\n",
" dictOfPostal[old_postal_code] = [ row['Borough'], new_neighborhood, dic_latitude[old_postal_code], dic_longitude[old_postal_code] ]\n",
" old_postal_code = row['PostalCode']\n",
" old_neighborhood = row['Neighborhood']\n",
"\n",
"print ('Our dataframe now is ' + str (df_new.PostalCode.unique().shape[0]) + \" rows\")\n",
"\n",
"# finally assign each key to its values\n",
"dic_values_list = list(dictOfPostal.values())\n",
"dic_keys_list = list(dictOfPostal.keys())\n",
"dic_list = []\n",
"for i in range (0, len(dic_keys_list)):\n",
" dic_list.append( ( dic_keys_list[i], dic_values_list[i][0], dic_values_list[i][1], dic_values_list[i][2], dic_values_list[i][3] ))\n",
" \n",
" \n",
"# here we are convert list to dataframe and give it proper headers\n",
"df_results = pd.DataFrame(dic_list)\n",
"df_results.columns = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']\n",
"\n",
"#print results \n",
"df_results\n",
"\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment