Created
May 7, 2020 13:21
-
-
Save abhiray92/63b91475dfb7c1611a3b335bd4dc802f to your computer and use it in GitHub Desktop.
Created on Skills Network Labs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Collecting package metadata (current_repodata.json): done\n", | |
"Solving environment: done\n", | |
"\n", | |
"## Package Plan ##\n", | |
"\n", | |
" environment location: /home/jupyterlab/conda/envs/python\n", | |
"\n", | |
" added / updated specs:\n", | |
" - beautifulsoup4\n", | |
"\n", | |
"\n", | |
"The following packages will be downloaded:\n", | |
"\n", | |
" package | build\n", | |
" ---------------------------|-----------------\n", | |
" beautifulsoup4-4.9.0 | py36_0 167 KB\n", | |
" ca-certificates-2020.1.1 | 0 125 KB\n", | |
" certifi-2020.4.5.1 | py36_0 155 KB\n", | |
" openssl-1.1.1g | h7b6447c_0 2.5 MB\n", | |
" soupsieve-2.0 | py_0 33 KB\n", | |
" ------------------------------------------------------------\n", | |
" Total: 3.0 MB\n", | |
"\n", | |
"The following NEW packages will be INSTALLED:\n", | |
"\n", | |
" beautifulsoup4 pkgs/main/linux-64::beautifulsoup4-4.9.0-py36_0\n", | |
" soupsieve pkgs/main/noarch::soupsieve-2.0-py_0\n", | |
"\n", | |
"The following packages will be UPDATED:\n", | |
"\n", | |
" openssl conda-forge::openssl-1.1.1f-h516909a_0 --> pkgs/main::openssl-1.1.1g-h7b6447c_0\n", | |
"\n", | |
"The following packages will be SUPERSEDED by a higher-priority channel:\n", | |
"\n", | |
" ca-certificates conda-forge::ca-certificates-2020.4.5~ --> pkgs/main::ca-certificates-2020.1.1-0\n", | |
" certifi conda-forge::certifi-2020.4.5.1-py36h~ --> pkgs/main::certifi-2020.4.5.1-py36_0\n", | |
"\n", | |
"\n", | |
"\n", | |
"Downloading and Extracting Packages\n", | |
"ca-certificates-2020 | 125 KB | ##################################### | 100% \n", | |
"beautifulsoup4-4.9.0 | 167 KB | ##################################### | 100% \n", | |
"openssl-1.1.1g | 2.5 MB | ##################################### | 100% \n", | |
"soupsieve-2.0 | 33 KB | ##################################### | 100% \n", | |
"certifi-2020.4.5.1 | 155 KB | ##################################### | 100% \n", | |
"Preparing transaction: done\n", | |
"Verifying transaction: done\n", | |
"Executing transaction: done\n" | |
] | |
} | |
], | |
"source": [ | |
"!conda install beautifulsoup4 -y" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import requests\n", | |
"from bs4 import BeautifulSoup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[<th>Postal Code\n", | |
" </th>,\n", | |
" <th>Borough\n", | |
" </th>,\n", | |
" <th>Neighborhood\n", | |
" </th>]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import requests\n", | |
"\n", | |
"wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\n", | |
"raw_random_wikipedia_page=requests.get(wikipedia_link)\n", | |
"markup=raw_random_wikipedia_page.text\n", | |
"soupy = BeautifulSoup(markup, 'html.parser')\n", | |
"#print(soup.prettify())\n", | |
"soup=soupy.find_all('tbody')[0]\n", | |
"table=soup.find_all('th')\n", | |
"table " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#soup.find_all('tr')[1:]\n", | |
"poplu = []\n", | |
"bor = []\n", | |
"neigh = []\n", | |
"\n", | |
"for data in soup.find_all('tr')[1:]:\n", | |
" #soup.find_all('td')[0]\n", | |
" poplu.append(data.find_all('td')[0].text)\n", | |
" bor.append(data.find_all('td')[1].text)\n", | |
" neigh.append(data.find_all('td')[2].text)\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 131, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PostalCode</th>\n", | |
" <th>Borough</th>\n", | |
" <th>Neighborhood</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>M1A\\n</td>\n", | |
" <td>Not assigned\\n</td>\n", | |
" <td>\\n</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>M2A\\n</td>\n", | |
" <td>Not assigned\\n</td>\n", | |
" <td>\\n</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>M3A\\n</td>\n", | |
" <td>North York\\n</td>\n", | |
" <td>Parkwoods\\n</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>M4A\\n</td>\n", | |
" <td>North York\\n</td>\n", | |
" <td>Victoria Village\\n</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>M5A\\n</td>\n", | |
" <td>Downtown Toronto\\n</td>\n", | |
" <td>Regent Park / Harbourfront\\n</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PostalCode Borough Neighborhood\n", | |
"0 M1A\\n Not assigned\\n \\n\n", | |
"1 M2A\\n Not assigned\\n \\n\n", | |
"2 M3A\\n North York\\n Parkwoods\\n\n", | |
"3 M4A\\n North York\\n Victoria Village\\n\n", | |
"4 M5A\\n Downtown Toronto\\n Regent Park / Harbourfront\\n" | |
] | |
}, | |
"execution_count": 131, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"poplu\n", | |
"dfObj = pd.DataFrame({'PostalCode': poplu, 'Borough':bor, 'Neighborhood':neigh})\n", | |
"#dfObj['Neighborhood'] = dfObj.Neighborhood.str.replace('(\\\\n)', np.nan)\n", | |
"dfObj.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 132, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dfObj = dfObj.replace('\\n',' ', regex=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dfObj['Neighborhood'] = dfObj['Neighborhood'].replace(' ', np.nan)\n", | |
"dfObj[dfObj.columns] = dfObj.apply(lambda x: x.str.strip())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 134, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PostalCode</th>\n", | |
" <th>Borough</th>\n", | |
" <th>Neighborhood</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>M1A</td>\n", | |
" <td>Not assigned</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>M2A</td>\n", | |
" <td>Not assigned</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>M3A</td>\n", | |
" <td>North York</td>\n", | |
" <td>Parkwoods</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>M4A</td>\n", | |
" <td>North York</td>\n", | |
" <td>Victoria Village</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>M5A</td>\n", | |
" <td>Downtown Toronto</td>\n", | |
" <td>Regent Park / Harbourfront</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PostalCode Borough Neighborhood\n", | |
"0 M1A Not assigned NaN\n", | |
"1 M2A Not assigned NaN\n", | |
"2 M3A North York Parkwoods\n", | |
"3 M4A North York Victoria Village\n", | |
"4 M5A Downtown Toronto Regent Park / Harbourfront" | |
] | |
}, | |
"execution_count": 134, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dfObj.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 135, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dfObj = dfObj[dfObj.Borough != 'Not assigned']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 136, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"PostalCode 0\n", | |
"Borough 0\n", | |
"Neighborhood 0\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 136, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dfObj.isnull().sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 139, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(103, 3)\n", | |
" PostalCode Borough Neighborhood\n", | |
"2 M3A North York Parkwoods\n", | |
"3 M4A North York Victoria Village\n", | |
"4 M5A Downtown Toronto Regent Park / Harbourfront\n", | |
"5 M6A North York Lawrence Manor / Lawrence Heights\n", | |
"6 M7A Downtown Toronto Queen's Park / Ontario Provincial Government\n" | |
] | |
} | |
], | |
"source": [ | |
"print(dfObj.shape)\n", | |
"print(dfObj.head())" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python", | |
"language": "python", | |
"name": "conda-env-python-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment