pskifast/Week3_assignment_Toronto clustering.ipynb

## Week3_assignment_Toronto clustering.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Segmenting and Clustering Neighborhoods in Toronto"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Libraries used by this notebook\n",
    "import sys\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "\n",
    "# Install lxml through the kernel's own interpreter so the package lands in the\n",
    "# environment this notebook runs in; a bare `!pip` uses whichever pip is first\n",
    "# on the shell PATH, which may belong to a different environment.\n",
    "!\"{sys.executable}\" -m pip install lxml\n",
    "\n",
    "print('All imported!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Section 1: Web scraping Wikipedia HTML tables\n",
    "\n",
    "Data source: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every HTML table on the page into a list of DataFrames,\n",
    "# using each table's first row (header=0) as its column labels\n",
    "wiki_url = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\n",
    "data = pd.read_html(wiki_url, header=0)\n",
    "\n",
    "# Keep the first table found and rename its three columns\n",
    "df = data[0]\n",
    "df.columns = ['PostalCode', 'Borough', 'Neighborhood']\n",
    "\n",
    "# Display the scraped table\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Section 2: Performing required operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data cleaning per the instructions.\n",
    "# The steps work on a copy, so the frame that `df` pointed to on entry\n",
    "# is not modified in place.\n",
    "\n",
    "# 1. Keep only the rows whose borough has been assigned\n",
    "assigned = df[df['Borough'] != 'Not assigned'].copy()\n",
    "\n",
    "# 2. Where the neighborhood is 'Not assigned', fall back to the borough name\n",
    "no_hood = assigned['Neighborhood'] == 'Not assigned'\n",
    "assigned.loc[no_hood, 'Neighborhood'] = assigned.loc[no_hood, 'Borough']\n",
    "\n",
    "# 3. One row per postal code: first borough, neighborhoods joined with ', '\n",
    "df = assigned.groupby('PostalCode').agg({'Borough': 'first', 'Neighborhood': ', '.join}).reset_index()\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Section 3: Verifying results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Testing whether the grouping works by comparing with the table given in the instructions**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the row for postal code M9V to spot-check the grouping\n",
    "df[df['PostalCode'].eq('M9V')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Printing the number of rows in the resulting table**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df.shape is (n_rows, n_columns); keep only the row count and report it\n",
    "rows = df.shape[0]\n",
    "print('Final table has', rows, 'rows.')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python",
   "language": "python",
   "name": "conda-env-python-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Segmenting and Clustering Neighborhoods in Toronto"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Libraries used by this notebook\n",
	"import sys\n",
	"\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"import seaborn as sns\n",
	"%matplotlib inline\n",
	"\n",
	"# Install lxml through the kernel's own interpreter so the package lands in the\n",
	"# environment this notebook runs in; a bare `!pip` uses whichever pip is first\n",
	"# on the shell PATH, which may belong to a different environment.\n",
	"!\"{sys.executable}\" -m pip install lxml\n",
	"\n",
	"print('All imported!')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Section 1: Web scraping Wikipedia HTML tables\n",
	"\n",
	"Data source: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read every HTML table on the page into a list of DataFrames,\n",
	"# using each table's first row (header=0) as its column labels\n",
	"wiki_url = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\n",
	"data = pd.read_html(wiki_url, header=0)\n",
	"\n",
	"# Keep the first table found and rename its three columns\n",
	"df = data[0]\n",
	"df.columns = ['PostalCode', 'Borough', 'Neighborhood']\n",
	"\n",
	"# Display the scraped table\n",
	"df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Section 2: Performing required operations"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Data cleaning per the instructions.\n",
	"# The steps work on a copy, so the frame that `df` pointed to on entry\n",
	"# is not modified in place.\n",
	"\n",
	"# 1. Keep only the rows whose borough has been assigned\n",
	"assigned = df[df['Borough'] != 'Not assigned'].copy()\n",
	"\n",
	"# 2. Where the neighborhood is 'Not assigned', fall back to the borough name\n",
	"no_hood = assigned['Neighborhood'] == 'Not assigned'\n",
	"assigned.loc[no_hood, 'Neighborhood'] = assigned.loc[no_hood, 'Borough']\n",
	"\n",
	"# 3. One row per postal code: first borough, neighborhoods joined with ', '\n",
	"df = assigned.groupby('PostalCode').agg({'Borough': 'first', 'Neighborhood': ', '.join}).reset_index()\n",
	"\n",
	"df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Section 3: Verifying results"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Testing whether the grouping works by comparing with the table given in the instructions"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Show the row for postal code M9V to spot-check the grouping\n",
	"df[df['PostalCode'].eq('M9V')]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Printing the number of rows in the resulting table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# df.shape is (n_rows, n_columns); keep only the row count and report it\n",
	"rows = df.shape[0]\n",
	"print('Final table has', rows, 'rows.')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python",
	"language": "python",
	"name": "conda-env-python-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}