Skip to content

Instantly share code, notes, and snippets.

@abhiray92
Created May 7, 2020 13:21
Show Gist options
  • Save abhiray92/63b91475dfb7c1611a3b335bd4dc802f to your computer and use it in GitHub Desktop.
Save abhiray92/63b91475dfb7c1611a3b335bd4dc802f to your computer and use it in GitHub Desktop.
Created on Skills Network Labs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting package metadata (current_repodata.json): done\n",
"Solving environment: done\n",
"\n",
"## Package Plan ##\n",
"\n",
" environment location: /home/jupyterlab/conda/envs/python\n",
"\n",
" added / updated specs:\n",
" - beautifulsoup4\n",
"\n",
"\n",
"The following packages will be downloaded:\n",
"\n",
" package | build\n",
" ---------------------------|-----------------\n",
" beautifulsoup4-4.9.0 | py36_0 167 KB\n",
" ca-certificates-2020.1.1 | 0 125 KB\n",
" certifi-2020.4.5.1 | py36_0 155 KB\n",
" openssl-1.1.1g | h7b6447c_0 2.5 MB\n",
" soupsieve-2.0 | py_0 33 KB\n",
" ------------------------------------------------------------\n",
" Total: 3.0 MB\n",
"\n",
"The following NEW packages will be INSTALLED:\n",
"\n",
" beautifulsoup4 pkgs/main/linux-64::beautifulsoup4-4.9.0-py36_0\n",
" soupsieve pkgs/main/noarch::soupsieve-2.0-py_0\n",
"\n",
"The following packages will be UPDATED:\n",
"\n",
" openssl conda-forge::openssl-1.1.1f-h516909a_0 --> pkgs/main::openssl-1.1.1g-h7b6447c_0\n",
"\n",
"The following packages will be SUPERSEDED by a higher-priority channel:\n",
"\n",
" ca-certificates conda-forge::ca-certificates-2020.4.5~ --> pkgs/main::ca-certificates-2020.1.1-0\n",
" certifi conda-forge::certifi-2020.4.5.1-py36h~ --> pkgs/main::certifi-2020.4.5.1-py36_0\n",
"\n",
"\n",
"\n",
"Downloading and Extracting Packages\n",
"ca-certificates-2020 | 125 KB | ##################################### | 100% \n",
"beautifulsoup4-4.9.0 | 167 KB | ##################################### | 100% \n",
"openssl-1.1.1g | 2.5 MB | ##################################### | 100% \n",
"soupsieve-2.0 | 33 KB | ##################################### | 100% \n",
"certifi-2020.4.5.1 | 155 KB | ##################################### | 100% \n",
"Preparing transaction: done\n",
"Verifying transaction: done\n",
"Executing transaction: done\n"
]
}
],
"source": [
"# Install BeautifulSoup into the active conda environment; --yes skips the confirmation prompt.\n",
"!conda install --yes beautifulsoup4"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<th>Postal Code\n",
" </th>,\n",
" <th>Borough\n",
" </th>,\n",
" <th>Neighborhood\n",
" </th>]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import requests\n",
"\n",
"# Fetch the Wikipedia page listing Canadian postal codes that start with 'M'.\n",
"wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\n",
"# A timeout keeps the cell from hanging indefinitely on a stalled connection.\n",
"raw_random_wikipedia_page=requests.get(wikipedia_link, timeout=30)\n",
"# Fail loudly on an HTTP error instead of parsing an error page as if it were the table.\n",
"raw_random_wikipedia_page.raise_for_status()\n",
"markup=raw_random_wikipedia_page.text\n",
"soupy = BeautifulSoup(markup, 'html.parser')\n",
"\n",
"# The first <tbody> on the page is used as the postal-code table; later cells read `soup`.\n",
"tbodies = soupy.find_all('tbody')\n",
"if not tbodies:\n",
"    raise ValueError('No <tbody> element found at ' + wikipedia_link)\n",
"soup=tbodies[0]\n",
"\n",
"# Header cells of that table, displayed as a quick sanity check.\n",
"table=soup.find_all('th')\n",
"table"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Collect the text of the first three <td> cells of every table row,\n",
"# skipping the first <tr> (presumably the header row).\n",
"poplu = []  # text of the first cell in each row\n",
"bor = []    # text of the second cell in each row\n",
"neigh = []  # text of the third cell in each row\n",
"\n",
"body_rows = soup.find_all('tr')[1:]\n",
"for row in body_rows:\n",
"    cells = row.find_all('td')\n",
"    poplu.append(cells[0].text)\n",
"    bor.append(cells[1].text)\n",
"    neigh.append(cells[2].text)"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1A\\n</td>\n",
" <td>Not assigned\\n</td>\n",
" <td>\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M2A\\n</td>\n",
" <td>Not assigned\\n</td>\n",
" <td>\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A\\n</td>\n",
" <td>North York\\n</td>\n",
" <td>Parkwoods\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A\\n</td>\n",
" <td>North York\\n</td>\n",
" <td>Victoria Village\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A\\n</td>\n",
" <td>Downtown Toronto\\n</td>\n",
" <td>Regent Park / Harbourfront\\n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"0 M1A\\n Not assigned\\n \\n\n",
"1 M2A\\n Not assigned\\n \\n\n",
"2 M3A\\n North York\\n Parkwoods\\n\n",
"3 M4A\\n North York\\n Victoria Village\\n\n",
"4 M5A\\n Downtown Toronto\\n Regent Park / Harbourfront\\n"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Assemble the three scraped lists into one DataFrame and preview the first rows.\n",
"dfObj = pd.DataFrame(\n",
"    {\n",
"        'PostalCode': poplu,\n",
"        'Borough': bor,\n",
"        'Neighborhood': neigh,\n",
"    }\n",
")\n",
"dfObj.head()"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
"# Turn every newline inside the scraped cell text into a single space (regex replace on all columns).\n",
"dfObj = dfObj.replace(to_replace='\\n', value=' ', regex=True)"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"# Trim surrounding whitespace first, while every column still holds strings;\n",
"# trimming after inserting NaN could break .str if a column ended up all-NaN.\n",
"dfObj = dfObj.apply(lambda col: col.str.strip())\n",
"# A neighborhood that is empty after trimming is missing data, however many blanks it contained.\n",
"dfObj['Neighborhood'] = dfObj['Neighborhood'].replace('', np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1A</td>\n",
" <td>Not assigned</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M2A</td>\n",
" <td>Not assigned</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park / Harbourfront</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"0 M1A Not assigned NaN\n",
"1 M2A Not assigned NaN\n",
"2 M3A North York Parkwoods\n",
"3 M4A North York Victoria Village\n",
"4 M5A Downtown Toronto Regent Park / Harbourfront"
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows after cleaning.\n",
"dfObj.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the rows whose Borough is something other than 'Not assigned'.\n",
"has_borough = dfObj['Borough'] != 'Not assigned'\n",
"dfObj = dfObj[has_borough]"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PostalCode 0\n",
"Borough 0\n",
"Neighborhood 0\n",
"dtype: int64"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count the missing values in each column.\n",
"dfObj.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(103, 3)\n",
" PostalCode Borough Neighborhood\n",
"2 M3A North York Parkwoods\n",
"3 M4A North York Victoria Village\n",
"4 M5A Downtown Toronto Regent Park / Harbourfront\n",
"5 M6A North York Lawrence Manor / Lawrence Heights\n",
"6 M7A Downtown Toronto Queen's Park / Ontario Provincial Government\n"
]
}
],
"source": [
"# Print the (rows, columns) shape followed by the first five rows, one per print line.\n",
"print(dfObj.shape, dfObj.head(), sep='\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python",
"language": "python",
"name": "conda-env-python-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment