Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save namdoan194/5a7be1d16df9c64e1faaa4a62e0a4eda to your computer and use it in GitHub Desktop.
Save namdoan194/5a7be1d16df9c64e1faaa4a62e0a4eda to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "**Notebook Created**"
},
{
"metadata": {},
"cell_type": "code",
"source": "import numpy as np # library to handle data in a vectorized manner\nimport pandas as pd # library for data analysis\nimport requests # library for HTTP requests (downloads the page to scrape)\n\nprint('Libraries imported.')",
"execution_count": 105,
"outputs": [
{
"output_type": "stream",
"text": "Libraries imported.\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "**Wrangling Website**"
},
{
"metadata": {},
"cell_type": "code",
"source": "import requests\nfrom urllib.request import urlopen\nfrom bs4 import BeautifulSoup\nimport ssl\nimport csv\n\nprint('BeautifulSoup & csv imported.')",
"execution_count": 106,
"outputs": [
{
"output_type": "stream",
"text": "BeautifulSoup & csv imported.\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "code",
"source": "# Build a TLS context that skips hostname and certificate checks.\n# check_hostname is cleared before verify_mode becomes CERT_NONE; keep that order.\n# NOTE(review): no other cell shown here passes `ctx` to a request - confirm it is needed.\nctx = ssl.create_default_context()\nctx.check_hostname, ctx.verify_mode = False, ssl.CERT_NONE\n\nprint('SSL certificate errors ignored.')",
"execution_count": 107,
"outputs": [
{
"output_type": "stream",
"text": "SSL certificate errors ignored.\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "code",
"source": "# Download the Wikipedia page that lists the Toronto (M) postal codes.\n# The timeout keeps the cell from hanging on a stalled connection, and\n# raise_for_status() stops on an HTTP error instead of parsing an error page.\nresponse = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)\nresponse.raise_for_status()\nsource = response.text\n\nsoup = BeautifulSoup(source, 'lxml')\n",
"execution_count": 141,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "# Find the postal-code table first; its rows can only be collected once `table` exists.\ntable = soup.find('table',{'class':'wikitable sortable'})\nif table is None:\n    raise ValueError('page has no table with class wikitable sortable')\ntable_rows = table.find_all('tr')\n\n# One list of stripped <td> texts per table row; a row without <td> cells gives [].\ndata = []\nfor row in table_rows:\n    data.append([t.text.strip() for t in row.find_all('td')])\n\ndf = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])\n# Rows that produced no cells have a missing PostalCode; drop them.\ndf = df[~df['PostalCode'].isnull()]",
"execution_count": 142,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "**Data Transformed into pandas DataFrame**"
},
{
"metadata": {},
"cell_type": "code",
"source": "df.info()",
"execution_count": 143,
"outputs": [
{
"output_type": "stream",
"text": "<class 'pandas.core.frame.DataFrame'>\nInt64Index: 180 entries, 1 to 180\nData columns (total 3 columns):\nPostalCode 180 non-null object\nBorough 180 non-null object\nNeighbourhood 180 non-null object\ndtypes: object(3)\nmemory usage: 5.6+ KB\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "code",
"source": "df.shape",
"execution_count": 144,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 144,
"data": {
"text/plain": "(180, 3)"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "import pandas\nimport requests\nfrom bs4 import BeautifulSoup\n\n# Download the page; time out rather than hang, and stop on an HTTP error status.\nresponse = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)\nresponse.raise_for_status()\nwebsite_text = response.text\nsoup = BeautifulSoup(website_text,'lxml')\n\ntable = soup.find('table',{'class':'wikitable sortable'})\nif table is None:\n    raise ValueError('page has no table with class wikitable sortable')\ntable_rows = table.find_all('tr')\n\n# One list of stripped <td> texts per table row; a row without <td> cells gives [].\ndata = [[t.text.strip() for t in row.find_all('td')] for row in table_rows]\n\ndf = pandas.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])\ndf = df[df['PostalCode'].notna()] # to filter out bad rows\n",
"execution_count": 145,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "# Renumber the rows from 0 and discard the old index rather than keeping it as an\n# integer 'index' column: a later cell combines column values with ','.join,\n# which only works on strings.\ndf1 = df.reset_index(drop=True)",
"execution_count": 146,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "# Collapse rows that share a postal code, joining their text values with commas.\n# The two text columns are selected explicitly so that a non-string column in df1\n# (such as the 'index' column a plain reset_index() adds) never reaches ','.join.\ndf2 = df1.groupby('PostalCode')[['Borough', 'Neighbourhood']].agg(lambda values: ','.join(values))",
"execution_count": 147,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "# Where no neighbourhood is recorded, fall back to the borough value of the same row.\nno_neighbourhood = df2['Neighbourhood'] == \"Not assigned\"\ndf2.loc[no_neighbourhood, 'Neighbourhood'] = df2.loc[no_neighbourhood, 'Borough']",
"execution_count": 148,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "# Repeats the previous cell's fallback; rows that still match already hold their\n# borough value, so running it again leaves df2 unchanged.\nstill_unassigned = df2['Neighbourhood'].eq(\"Not assigned\")\ndf2.loc[still_unassigned, 'Neighbourhood'] = df2.loc[still_unassigned, 'Borough']\n",
"execution_count": 149,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "# Turn the PostalCode group index back into an ordinary column.\ndf3 = df2.reset_index(drop=False)",
"execution_count": 150,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "def _unique_boroughs(joined):\n    \"\"\"Return the distinct borough names found in a comma-joined string.\n\n    Names keep their internal spaces and first-seen order; empty pieces,\n    the literal 'nan' and stray braces are dropped.\n    \"\"\"\n    names = []\n    for piece in str(joined).split(','):\n        name = piece.replace('{', '').replace('}', '').strip()\n        if name and name != 'nan' and name not in names:\n            names.append(name)\n    return ','.join(names)\n\n\n# A regex-based clean-up that deletes every whitespace character would turn\n# 'Not assigned' into 'Notassigned'; it would also depend on str.replace treating\n# its pattern as a regex by default and on set(), whose order is arbitrary.\ndf3['Borough'] = df3['Borough'].apply(_unique_boroughs)",
"execution_count": 151,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "# Preview the first five rows after the borough clean-up.\ndf3.head(n=5)",
"execution_count": 152,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 152,
"data": {
"text/plain": " PostalCode Borough Neighbourhood\n0 M1A Notassigned Not assigned\n1 M1B Scarborough Malvern, Rouge\n2 M1C Scarborough Rouge Hill, Port Union, Highland Creek\n3 M1E Scarborough Guildwood, Morningside, West Hill\n4 M1G Scarborough Woburn",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PostalCode</th>\n <th>Borough</th>\n <th>Neighbourhood</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>M1A</td>\n <td>Notassigned</td>\n <td>Not assigned</td>\n </tr>\n <tr>\n <th>1</th>\n <td>M1B</td>\n <td>Scarborough</td>\n <td>Malvern, Rouge</td>\n </tr>\n <tr>\n <th>2</th>\n <td>M1C</td>\n <td>Scarborough</td>\n <td>Rouge Hill, Port Union, Highland Creek</td>\n </tr>\n <tr>\n <th>3</th>\n <td>M1E</td>\n <td>Scarborough</td>\n <td>Guildwood, Morningside, West Hill</td>\n </tr>\n <tr>\n <th>4</th>\n <td>M1G</td>\n <td>Scarborough</td>\n <td>Woburn</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "code",
"source": "# Keep only the rows whose neighbourhood is something other than 'Not assigned'.\nhas_neighbourhood = df3['Neighbourhood'] != 'Not assigned'\ndf4 = df3.loc[has_neighbourhood]",
"execution_count": 154,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"source": "# Preview the first ten rows of the filtered table.\ndf4.head(n=10)",
"execution_count": 156,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 156,
"data": {
"text/plain": " PostalCode Borough Neighbourhood\n1 M1B Scarborough Malvern, Rouge\n2 M1C Scarborough Rouge Hill, Port Union, Highland Creek\n3 M1E Scarborough Guildwood, Morningside, West Hill\n4 M1G Scarborough Woburn\n5 M1H Scarborough Cedarbrae\n6 M1J Scarborough Scarborough Village\n7 M1K Scarborough Kennedy Park, Ionview, East Birchmount Park\n8 M1L Scarborough Golden Mile, Clairlea, Oakridge\n9 M1M Scarborough Cliffside, Cliffcrest, Scarborough Village West\n10 M1N Scarborough Birch Cliff, Cliffside West",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PostalCode</th>\n <th>Borough</th>\n <th>Neighbourhood</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>M1B</td>\n <td>Scarborough</td>\n <td>Malvern, Rouge</td>\n </tr>\n <tr>\n <th>2</th>\n <td>M1C</td>\n <td>Scarborough</td>\n <td>Rouge Hill, Port Union, Highland Creek</td>\n </tr>\n <tr>\n <th>3</th>\n <td>M1E</td>\n <td>Scarborough</td>\n <td>Guildwood, Morningside, West Hill</td>\n </tr>\n <tr>\n <th>4</th>\n <td>M1G</td>\n <td>Scarborough</td>\n <td>Woburn</td>\n </tr>\n <tr>\n <th>5</th>\n <td>M1H</td>\n <td>Scarborough</td>\n <td>Cedarbrae</td>\n </tr>\n <tr>\n <th>6</th>\n <td>M1J</td>\n <td>Scarborough</td>\n <td>Scarborough Village</td>\n </tr>\n <tr>\n <th>7</th>\n <td>M1K</td>\n <td>Scarborough</td>\n <td>Kennedy Park, Ionview, East Birchmount Park</td>\n </tr>\n <tr>\n <th>8</th>\n <td>M1L</td>\n <td>Scarborough</td>\n <td>Golden Mile, Clairlea, Oakridge</td>\n </tr>\n <tr>\n <th>9</th>\n <td>M1M</td>\n <td>Scarborough</td>\n <td>Cliffside, Cliffcrest, Scarborough Village West</td>\n </tr>\n <tr>\n <th>10</th>\n <td>M1N</td>\n <td>Scarborough</td>\n <td>Birch Cliff, Cliffside West</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3.6",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.9",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment