namdoan194/Segmenting and Clustering in Toronto.ipynb

## Segmenting and Clustering in Toronto.ipynb
{
    "cells": [
        {
            "metadata": {},
            "cell_type": "markdown",
            "source": "**Notebook Created**"
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "import numpy as np # library to handle data in a vectorized manner\nimport pandas as pd # library for data analsysis\nimport requests # Library for web scraping\n\nprint('Libraries imported.')",
            "execution_count": 105,
            "outputs": [
                {
                    "output_type": "stream",
                    "text": "Libraries imported.\n",
                    "name": "stdout"
                }
            ]
        },
        {
            "metadata": {},
            "cell_type": "markdown",
            "source": "**Wrangling Website**"
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "import requests\nfrom urllib.request import urlopen\nfrom bs4 import BeautifulSoup\nimport ssl\nimport csv\n\nprint('BeautifulSoup  & csv imported.')",
            "execution_count": 106,
            "outputs": [
                {
                    "output_type": "stream",
                    "text": "BeautifulSoup  & csv imported.\n",
                    "name": "stdout"
                }
            ]
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "ctx = ssl.create_default_context()\nctx.check_hostname = False\nctx.verify_mode = ssl.CERT_NONE\n\nprint('SSL certificate errors ignored.')",
            "execution_count": 107,
            "outputs": [
                {
                    "output_type": "stream",
                    "text": "SSL certificate errors ignored.\n",
                    "name": "stdout"
                }
            ]
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text\n\nsoup = BeautifulSoup(source, 'lxml')\n",
            "execution_count": 141,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "table_rows = table.find_all('tr')\ntable = soup.find('table',{'class':'wikitable sortable'})\n\ndata = []\nfor row in table_rows:\n    data.append([t.text.strip() for t in row.find_all('td')])\n\ndf = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])\ndf = df[~df['PostalCode'].isnull()]  ",
            "execution_count": 142,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "markdown",
            "source": "**Data Transformed into pandas DataFrame**"
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df.info()",
            "execution_count": 143,
            "outputs": [
                {
                    "output_type": "stream",
                    "text": "<class 'pandas.core.frame.DataFrame'>\nInt64Index: 180 entries, 1 to 180\nData columns (total 3 columns):\nPostalCode       180 non-null object\nBorough          180 non-null object\nNeighbourhood    180 non-null object\ndtypes: object(3)\nmemory usage: 5.6+ KB\n",
                    "name": "stdout"
                }
            ]
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df.shape",
            "execution_count": 144,
            "outputs": [
                {
                    "output_type": "execute_result",
                    "execution_count": 144,
                    "data": {
                        "text/plain": "(180, 3)"
                    },
                    "metadata": {}
                }
            ]
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "",
            "execution_count": null,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "import pandas\nimport requests\nfrom bs4 import BeautifulSoup\nwebsite_text = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text\nsoup = BeautifulSoup(website_text,'lxml')\n\ntable = soup.find('table',{'class':'wikitable sortable'})\ntable_rows = table.find_all('tr')\n\ndata = []\nfor row in table_rows:\n    data.append([t.text.strip() for t in row.find_all('td')])\n\ndf = pandas.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])\ndf = df[~df['PostalCode'].isnull()]  # to filter out bad rows\n",
            "execution_count": 145,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df1 = df.reset_index()",
            "execution_count": 146,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df2= df1.groupby('PostalCode').agg(lambda x: ','.join(x))",
            "execution_count": 147,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df2.loc[df2['Neighbourhood']==\"Not assigned\",'Neighbourhood']=df2.loc[df2['Neighbourhood']==\"Not assigned\",'Borough']\n\n#df2.head()",
            "execution_count": 148,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df2.loc[df2['Neighbourhood']==\"Not assigned\",'Neighbourhood']=df2.loc[df2['Neighbourhood']==\"Not assigned\",'Borough']\n",
            "execution_count": 149,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df3 = df2.reset_index()",
            "execution_count": 150,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df3['Borough']= df3['Borough'].str.replace('nan|[{}\\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(\",{2,}\",\",\")",
            "execution_count": 151,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df3.head()",
            "execution_count": 152,
            "outputs": [
                {
                    "output_type": "execute_result",
                    "execution_count": 152,
                    "data": {
                        "text/plain": "  PostalCode      Borough                           Neighbourhood\n0        M1A  Notassigned                            Not assigned\n1        M1B  Scarborough                          Malvern, Rouge\n2        M1C  Scarborough  Rouge Hill, Port Union, Highland Creek\n3        M1E  Scarborough       Guildwood, Morningside, West Hill\n4        M1G  Scarborough                                  Woburn",
                        "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>PostalCode</th>\n      <th>Borough</th>\n      <th>Neighbourhood</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>M1A</td>\n      <td>Notassigned</td>\n      <td>Not assigned</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>M1B</td>\n      <td>Scarborough</td>\n      <td>Malvern, Rouge</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>M1C</td>\n      <td>Scarborough</td>\n      <td>Rouge Hill, Port Union, Highland Creek</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>M1E</td>\n      <td>Scarborough</td>\n      <td>Guildwood, Morningside, West Hill</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>M1G</td>\n      <td>Scarborough</td>\n      <td>Woburn</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
                    },
                    "metadata": {}
                }
            ]
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df4 = df3[df3.Neighbourhood != 'Not assigned']",
            "execution_count": 154,
            "outputs": []
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "df4.head(10)",
            "execution_count": 156,
            "outputs": [
                {
                    "output_type": "execute_result",
                    "execution_count": 156,
                    "data": {
                        "text/plain": "   PostalCode      Borough                                    Neighbourhood\n1         M1B  Scarborough                                   Malvern, Rouge\n2         M1C  Scarborough           Rouge Hill, Port Union, Highland Creek\n3         M1E  Scarborough                Guildwood, Morningside, West Hill\n4         M1G  Scarborough                                           Woburn\n5         M1H  Scarborough                                        Cedarbrae\n6         M1J  Scarborough                              Scarborough Village\n7         M1K  Scarborough      Kennedy Park, Ionview, East Birchmount Park\n8         M1L  Scarborough                  Golden Mile, Clairlea, Oakridge\n9         M1M  Scarborough  Cliffside, Cliffcrest, Scarborough Village West\n10        M1N  Scarborough                      Birch Cliff, Cliffside West",
                        "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>PostalCode</th>\n      <th>Borough</th>\n      <th>Neighbourhood</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1</th>\n      <td>M1B</td>\n      <td>Scarborough</td>\n      <td>Malvern, Rouge</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>M1C</td>\n      <td>Scarborough</td>\n      <td>Rouge Hill, Port Union, Highland Creek</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>M1E</td>\n      <td>Scarborough</td>\n      <td>Guildwood, Morningside, West Hill</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>M1G</td>\n      <td>Scarborough</td>\n      <td>Woburn</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>M1H</td>\n      <td>Scarborough</td>\n      <td>Cedarbrae</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>M1J</td>\n      <td>Scarborough</td>\n      <td>Scarborough Village</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>M1K</td>\n      <td>Scarborough</td>\n      <td>Kennedy Park, Ionview, East Birchmount Park</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>M1L</td>\n      <td>Scarborough</td>\n      <td>Golden Mile, Clairlea, Oakridge</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>M1M</td>\n      <td>Scarborough</td>\n      <td>Cliffside, Cliffcrest, Scarborough Village West</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>M1N</td>\n      <td>Scarborough</td>\n      <td>Birch Cliff, Cliffside West</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
                    },
                    "metadata": {}
                }
            ]
        },
        {
            "metadata": {},
            "cell_type": "code",
            "source": "",
            "execution_count": null,
            "outputs": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "name": "python3",
            "display_name": "Python 3.6",
            "language": "python"
        },
        "language_info": {
            "name": "python",
            "version": "3.6.9",
            "mimetype": "text/x-python",
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "pygments_lexer": "ipython3",
            "nbconvert_exporter": "python",
            "file_extension": ".py"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 1
}
	{
	"cells": [
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "Notebook Created"
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "import numpy as np # library to handle data in a vectorized manner\nimport pandas as pd # library for data analsysis\nimport requests # Library for web scraping\n\nprint('Libraries imported.')",
	"execution_count": 105,
	"outputs": [
	{
	"output_type": "stream",
	"text": "Libraries imported.\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "Wrangling Website"
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "import requests\nfrom urllib.request import urlopen\nfrom bs4 import BeautifulSoup\nimport ssl\nimport csv\n\nprint('BeautifulSoup & csv imported.')",
	"execution_count": 106,
	"outputs": [
	{
	"output_type": "stream",
	"text": "BeautifulSoup & csv imported.\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "ctx = ssl.create_default_context()\nctx.check_hostname = False\nctx.verify_mode = ssl.CERT_NONE\n\nprint('SSL certificate errors ignored.')",
	"execution_count": 107,
	"outputs": [
	{
	"output_type": "stream",
	"text": "SSL certificate errors ignored.\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text\n\nsoup = BeautifulSoup(source, 'lxml')\n",
	"execution_count": 141,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "table_rows = table.find_all('tr')\ntable = soup.find('table',{'class':'wikitable sortable'})\n\ndata = []\nfor row in table_rows:\n data.append([t.text.strip() for t in row.find_all('td')])\n\ndf = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])\ndf = df[~df['PostalCode'].isnull()] ",
	"execution_count": 142,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "Data Transformed into pandas DataFrame"
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df.info()",
	"execution_count": 143,
	"outputs": [
	{
	"output_type": "stream",
	"text": "<class 'pandas.core.frame.DataFrame'>\nInt64Index: 180 entries, 1 to 180\nData columns (total 3 columns):\nPostalCode 180 non-null object\nBorough 180 non-null object\nNeighbourhood 180 non-null object\ndtypes: object(3)\nmemory usage: 5.6+ KB\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df.shape",
	"execution_count": 144,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 144,
	"data": {
	"text/plain": "(180, 3)"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "import pandas\nimport requests\nfrom bs4 import BeautifulSoup\nwebsite_text = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text\nsoup = BeautifulSoup(website_text,'lxml')\n\ntable = soup.find('table',{'class':'wikitable sortable'})\ntable_rows = table.find_all('tr')\n\ndata = []\nfor row in table_rows:\n data.append([t.text.strip() for t in row.find_all('td')])\n\ndf = pandas.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])\ndf = df[~df['PostalCode'].isnull()] # to filter out bad rows\n",
	"execution_count": 145,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df1 = df.reset_index()",
	"execution_count": 146,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df2= df1.groupby('PostalCode').agg(lambda x: ','.join(x))",
	"execution_count": 147,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df2.loc[df2['Neighbourhood']==\"Not assigned\",'Neighbourhood']=df2.loc[df2['Neighbourhood']==\"Not assigned\",'Borough']\n\n#df2.head()",
	"execution_count": 148,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df2.loc[df2['Neighbourhood']==\"Not assigned\",'Neighbourhood']=df2.loc[df2['Neighbourhood']==\"Not assigned\",'Borough']\n",
	"execution_count": 149,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df3 = df2.reset_index()",
	"execution_count": 150,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df3['Borough']= df3['Borough'].str.replace('nan\|[{}\\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(\",{2,}\",\",\")",
	"execution_count": 151,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df3.head()",
	"execution_count": 152,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 152,
	"data": {
	"text/plain": " PostalCode Borough Neighbourhood\n0 M1A Notassigned Not assigned\n1 M1B Scarborough Malvern, Rouge\n2 M1C Scarborough Rouge Hill, Port Union, Highland Creek\n3 M1E Scarborough Guildwood, Morningside, West Hill\n4 M1G Scarborough Woburn",
	"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PostalCode</th>\n <th>Borough</th>\n <th>Neighbourhood</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>M1A</td>\n <td>Notassigned</td>\n <td>Not assigned</td>\n </tr>\n <tr>\n <th>1</th>\n <td>M1B</td>\n <td>Scarborough</td>\n <td>Malvern, Rouge</td>\n </tr>\n <tr>\n <th>2</th>\n <td>M1C</td>\n <td>Scarborough</td>\n <td>Rouge Hill, Port Union, Highland Creek</td>\n </tr>\n <tr>\n <th>3</th>\n <td>M1E</td>\n <td>Scarborough</td>\n <td>Guildwood, Morningside, West Hill</td>\n </tr>\n <tr>\n <th>4</th>\n <td>M1G</td>\n <td>Scarborough</td>\n <td>Woburn</td>\n </tr>\n </tbody>\n</table>\n</div>"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df4 = df3[df3.Neighbourhood != 'Not assigned']",
	"execution_count": 154,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "df4.head(10)",
	"execution_count": 156,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 156,
	"data": {
	"text/plain": " PostalCode Borough Neighbourhood\n1 M1B Scarborough Malvern, Rouge\n2 M1C Scarborough Rouge Hill, Port Union, Highland Creek\n3 M1E Scarborough Guildwood, Morningside, West Hill\n4 M1G Scarborough Woburn\n5 M1H Scarborough Cedarbrae\n6 M1J Scarborough Scarborough Village\n7 M1K Scarborough Kennedy Park, Ionview, East Birchmount Park\n8 M1L Scarborough Golden Mile, Clairlea, Oakridge\n9 M1M Scarborough Cliffside, Cliffcrest, Scarborough Village West\n10 M1N Scarborough Birch Cliff, Cliffside West",
	"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PostalCode</th>\n <th>Borough</th>\n <th>Neighbourhood</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>M1B</td>\n <td>Scarborough</td>\n <td>Malvern, Rouge</td>\n </tr>\n <tr>\n <th>2</th>\n <td>M1C</td>\n <td>Scarborough</td>\n <td>Rouge Hill, Port Union, Highland Creek</td>\n </tr>\n <tr>\n <th>3</th>\n <td>M1E</td>\n <td>Scarborough</td>\n <td>Guildwood, Morningside, West Hill</td>\n </tr>\n <tr>\n <th>4</th>\n <td>M1G</td>\n <td>Scarborough</td>\n <td>Woburn</td>\n </tr>\n <tr>\n <th>5</th>\n <td>M1H</td>\n <td>Scarborough</td>\n <td>Cedarbrae</td>\n </tr>\n <tr>\n <th>6</th>\n <td>M1J</td>\n <td>Scarborough</td>\n <td>Scarborough Village</td>\n </tr>\n <tr>\n <th>7</th>\n <td>M1K</td>\n <td>Scarborough</td>\n <td>Kennedy Park, Ionview, East Birchmount Park</td>\n </tr>\n <tr>\n <th>8</th>\n <td>M1L</td>\n <td>Scarborough</td>\n <td>Golden Mile, Clairlea, Oakridge</td>\n </tr>\n <tr>\n <th>9</th>\n <td>M1M</td>\n <td>Scarborough</td>\n <td>Cliffside, Cliffcrest, Scarborough Village West</td>\n </tr>\n <tr>\n <th>10</th>\n <td>M1N</td>\n <td>Scarborough</td>\n <td>Birch Cliff, Cliffside West</td>\n </tr>\n </tbody>\n</table>\n</div>"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3.6",
	"language": "python"
	},
	"language_info": {
	"name": "python",
	"version": "3.6.9",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}