Skip to content

Instantly share code, notes, and snippets.

@SonerYldrm
Created April 15, 2019 10:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SonerYldrm/b306dd930ad45554e8012d02d688b5d7 to your computer and use it in GitHub Desktop.
Save SonerYldrm/b306dd930ad45554e8012d02d688b5d7 to your computer and use it in GitHub Desktop.
Created on Cognitive Class Labs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import lxml.html as lh\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\n",
"#Create a handle, page, to handle the contents of the website\n",
"page = requests.get(url)\n",
"#Store the contents of the website under doc\n",
"doc = lh.fromstring(page.content)\n",
"#Parse data that are stored between <tr>..</tr> of HTML\n",
"tr_elements = doc.xpath('//tr')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Check the length of the first 12 rows\n",
"[len(T) for T in tr_elements[:12]]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1:\"Postcode\"\n",
"2:\"Borough\"\n",
"3:\"Neighbourhood\n",
"\"\n"
]
}
],
"source": [
"tr_elements = doc.xpath('//tr')\n",
"#Create empty list\n",
"col=[]\n",
"i=0\n",
"#For each row, store each first element (header) and an empty list\n",
"for t in tr_elements[0]:\n",
" i+=1\n",
" name=t.text_content()\n",
" print('%d:\"%s\"'%(i,name))\n",
" col.append((name,[]))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"294"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tr_elements)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#Since out first row is the header, data is stored on the second row onwards\n",
"for j in range(1,len(tr_elements)):\n",
" #T is our j'th row\n",
" T=tr_elements[j]\n",
" \n",
" #If row is not of size 10, the //tr data is not from our table \n",
" if len(T)!=3:\n",
" break\n",
" \n",
" #i is the index of our column\n",
" i=0\n",
" \n",
" #Iterate through each element of the row\n",
" for t in T.iterchildren():\n",
" data=t.text_content() \n",
" #Check if row is empty\n",
"\n",
" #Append the data to the empty list of the i'th column\n",
" col[i][1].append(data)\n",
" #Increment i for the next column\n",
" i+=1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[288, 288, 288]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[len(C) for (title,C) in col]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"Dict={title:column for (title,column) in col}\n",
"df=pd.DataFrame(Dict)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postcode</th>\n",
" <th>Borough</th>\n",
" <th>Neighbourhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M2A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Heights\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>M7A</td>\n",
" <td>Queen's Park</td>\n",
" <td>Not assigned\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>M8A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned\\n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postcode Borough Neighbourhood\\n\n",
"0 M1A Not assigned Not assigned\\n\n",
"1 M2A Not assigned Not assigned\\n\n",
"2 M3A North York Parkwoods\\n\n",
"3 M4A North York Victoria Village\\n\n",
"4 M5A Downtown Toronto Harbourfront\\n\n",
"5 M5A Downtown Toronto Regent Park\\n\n",
"6 M6A North York Lawrence Heights\\n\n",
"7 M6A North York Lawrence Manor\\n\n",
"8 M7A Queen's Park Not assigned\\n\n",
"9 M8A Not assigned Not assigned\\n"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(288, 3)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Ignore cells with a borough that is Not assigned.\n",
"df_1 = df[df['Borough'] != 'Not assigned'].reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>Postcode</th>\n",
" <th>Borough</th>\n",
" <th>Neighbourhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5</td>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Heights\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>7</td>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8</td>\n",
" <td>M7A</td>\n",
" <td>Queen's Park</td>\n",
" <td>Not assigned\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>10</td>\n",
" <td>M9A</td>\n",
" <td>Etobicoke</td>\n",
" <td>Islington Avenue\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>11</td>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>12</td>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Malvern\\n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" index Postcode Borough Neighbourhood\\n\n",
"0 2 M3A North York Parkwoods\\n\n",
"1 3 M4A North York Victoria Village\\n\n",
"2 4 M5A Downtown Toronto Harbourfront\\n\n",
"3 5 M5A Downtown Toronto Regent Park\\n\n",
"4 6 M6A North York Lawrence Heights\\n\n",
"5 7 M6A North York Lawrence Manor\\n\n",
"6 8 M7A Queen's Park Not assigned\\n\n",
"7 10 M9A Etobicoke Islington Avenue\\n\n",
"8 11 M1B Scarborough Rouge\\n\n",
"9 12 M1B Scarborough Malvern\\n"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_1.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"103"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_1['Postcode'].unique().size"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df_1.columns = ['Drop','Postcode', 'Borough', 'Neighborhood']"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Drop</th>\n",
" <th>Postcode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5</td>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Heights\\n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Drop Postcode Borough Neighborhood\n",
"0 2 M3A North York Parkwoods\\n\n",
"1 3 M4A North York Victoria Village\\n\n",
"2 4 M5A Downtown Toronto Harbourfront\\n\n",
"3 5 M5A Downtown Toronto Regent Park\\n\n",
"4 6 M6A North York Lawrence Heights\\n"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_1.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df_2 = df_1[['Postcode','Borough','Neighborhood']]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postcode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Heights\\n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postcode Borough Neighborhood\n",
"0 M3A North York Parkwoods\\n\n",
"1 M4A North York Victoria Village\\n\n",
"2 M5A Downtown Toronto Harbourfront\\n\n",
"3 M5A Downtown Toronto Regent Park\\n\n",
"4 M6A North York Lawrence Heights\\n"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_2.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"df_2 = df_2.groupby(['Postcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postcode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge\\n, Malvern\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M1C</td>\n",
" <td>Scarborough</td>\n",
" <td>Highland Creek\\n, Rouge Hill\\n, Port Union\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M1E</td>\n",
" <td>Scarborough</td>\n",
" <td>Guildwood\\n, Morningside\\n, West Hill\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M1G</td>\n",
" <td>Scarborough</td>\n",
" <td>Woburn\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M1H</td>\n",
" <td>Scarborough</td>\n",
" <td>Cedarbrae\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>M1J</td>\n",
" <td>Scarborough</td>\n",
" <td>Scarborough Village\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>M1K</td>\n",
" <td>Scarborough</td>\n",
" <td>East Birchmount Park\\n, Ionview\\n, Kennedy Park\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>M1L</td>\n",
" <td>Scarborough</td>\n",
" <td>Clairlea\\n, Golden Mile\\n, Oakridge\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>M1M</td>\n",
" <td>Scarborough</td>\n",
" <td>Cliffcrest\\n, Cliffside\\n, Scarborough Village...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>M1N</td>\n",
" <td>Scarborough</td>\n",
" <td>Birch Cliff\\n, Cliffside West\\n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postcode Borough Neighborhood\n",
"0 M1B Scarborough Rouge\\n, Malvern\\n\n",
"1 M1C Scarborough Highland Creek\\n, Rouge Hill\\n, Port Union\\n\n",
"2 M1E Scarborough Guildwood\\n, Morningside\\n, West Hill\\n\n",
"3 M1G Scarborough Woburn\\n\n",
"4 M1H Scarborough Cedarbrae\\n\n",
"5 M1J Scarborough Scarborough Village\\n\n",
"6 M1K Scarborough East Birchmount Park\\n, Ionview\\n, Kennedy Park\\n\n",
"7 M1L Scarborough Clairlea\\n, Golden Mile\\n, Oakridge\\n\n",
"8 M1M Scarborough Cliffcrest\\n, Cliffside\\n, Scarborough Village...\n",
"9 M1N Scarborough Birch Cliff\\n, Cliffside West\\n"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_2.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(103, 3)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_2.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment