Skip to content

Instantly share code, notes, and snippets.

@georgehc
Created November 5, 2024 19:12
Show Gist options
  • Save georgehc/735bb60dc663e1f8e0510a4688d5e70c to your computer and use it in GitHub Desktop.
Save georgehc/735bb60dc663e1f8e0510a4688d5e70c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 95-865: More on PCA, Argsort\n",
"Author: Erick Rodriguez (erickger [at symbol] cmu.edu), former TA for 95-865 <br>\n",
"Modified by George H. Chen (georgechen [at symbol] cmu.edu), Nov 5, 2024\n",
"\n",
"This demo is based on Mark Richardson's 2009 \"Principal Component Analysis\" notes and uses data he pulled from DEFRA on 1997 UK food consumption (grams/person/week). This dataset is also used as a nice illustrated example of PCA here:\n",
"http://setosa.io/ev/principal-component-analysis/"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Creating the dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('seaborn-v0_8') # prettier plots\n",
"import numpy as np\n",
"\n",
"# grams per person per week\n",
"food_data = np.array([[105, 103, 103, 66],\n",
" [245, 227, 242, 267],\n",
" [685, 803, 750, 586],\n",
" [147, 160, 122, 93],\n",
" [193, 235, 184, 209], \n",
" [156, 175, 147, 139],\n",
" [720, 874, 566, 1033],\n",
" [253, 265, 171, 143],\n",
" [488, 570, 418, 355],\n",
" [198, 203, 220, 187],\n",
" [360, 365, 337, 334],\n",
" [1102, 1137, 957, 674],\n",
" [1472, 1582, 1462, 1494],\n",
" [57, 73, 53, 47],\n",
" [1374, 1256, 1572, 1506],\n",
" [375, 475, 458, 135],\n",
" [54, 64, 62, 41]])\n",
"row_labels = ['Cheese',\n",
" 'Carcass meat',\n",
" 'Other meat',\n",
" 'Fish',\n",
" 'Fats and oils',\n",
" 'Sugars',\n",
" 'Fresh potatoes',\n",
" 'Fresh Veg',\n",
" 'Other Veg',\n",
" 'Processed potatoes',\n",
" 'Processed Veg',\n",
" 'Fresh fruit',\n",
" 'Cereals',\n",
" 'Beverages',\n",
" 'Soft drinks',\n",
" 'Alcoholic drinks',\n",
" 'Confectionary']\n",
"column_labels = ['England', 'Wales', 'Scotland', 'N. Ireland']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Looking at the table with a dataframe"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"A module that was compiled using NumPy 1.x cannot be run in\n",
"NumPy 2.0.2 as it may crash. To support both 1.x and 2.x\n",
"versions of NumPy, modules must be compiled with NumPy 2.0.\n",
"Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n",
"\n",
"If you are a user of the module, the easiest solution will be to\n",
"downgrade to 'numpy<2' or try to upgrade the affected module.\n",
"We expect that some modules will need time to support NumPy 2.\n",
"\n",
"Traceback (most recent call last): File \"<frozen runpy>\", line 198, in _run_module_as_main\n",
" File \"<frozen runpy>\", line 88, in _run_code\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py\", line 17, in <module>\n",
" app.launch_new_instance()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py\", line 992, in launch_instance\n",
" app.start()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py\", line 736, in start\n",
" self.io_loop.start()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/tornado/platform/asyncio.py\", line 195, in start\n",
" self.asyncio_loop.run_forever()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/asyncio/base_events.py\", line 607, in run_forever\n",
" self._run_once()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/asyncio/base_events.py\", line 1922, in _run_once\n",
" handle._run()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/asyncio/events.py\", line 80, in _run\n",
" self._context.run(self._callback, *self._args)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py\", line 516, in dispatch_queue\n",
" await self.process_one()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py\", line 505, in process_one\n",
" await dispatch(*args)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py\", line 412, in dispatch_shell\n",
" await result\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py\", line 740, in execute_request\n",
" reply_content = await reply_content\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/ipkernel.py\", line 422, in do_execute\n",
" res = shell.run_cell(\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/zmqshell.py\", line 546, in run_cell\n",
" return super().run_cell(*args, **kwargs)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3024, in run_cell\n",
" result = self._run_cell(\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3079, in _run_cell\n",
" result = runner(coro)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/async_helpers.py\", line 129, in _pseudo_sync_runner\n",
" coro.send(None)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3284, in run_cell_async\n",
" has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3466, in run_ast_nodes\n",
" if await self.run_code(code, result, async_=asy):\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3526, in run_code\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
" File \"/var/folders/ms/yzwxs7r54nx0q978svcpsnbh0000gn/T/ipykernel_84926/3546127722.py\", line 1, in <module>\n",
" import pandas as pd\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/pandas/__init__.py\", line 26, in <module>\n",
" from pandas.compat import (\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/pandas/compat/__init__.py\", line 27, in <module>\n",
" from pandas.compat.pyarrow import (\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/pandas/compat/pyarrow.py\", line 8, in <module>\n",
" import pyarrow as pa\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/pyarrow/__init__.py\", line 65, in <module>\n",
" import pyarrow.lib as _lib\n"
]
},
{
"ename": "AttributeError",
"evalue": "_ARRAY_API not found",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;31mAttributeError\u001b[0m: _ARRAY_API not found"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"A module that was compiled using NumPy 1.x cannot be run in\n",
"NumPy 2.0.2 as it may crash. To support both 1.x and 2.x\n",
"versions of NumPy, modules must be compiled with NumPy 2.0.\n",
"Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n",
"\n",
"If you are a user of the module, the easiest solution will be to\n",
"downgrade to 'numpy<2' or try to upgrade the affected module.\n",
"We expect that some modules will need time to support NumPy 2.\n",
"\n",
"Traceback (most recent call last): File \"<frozen runpy>\", line 198, in _run_module_as_main\n",
" File \"<frozen runpy>\", line 88, in _run_code\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py\", line 17, in <module>\n",
" app.launch_new_instance()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py\", line 992, in launch_instance\n",
" app.start()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py\", line 736, in start\n",
" self.io_loop.start()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/tornado/platform/asyncio.py\", line 195, in start\n",
" self.asyncio_loop.run_forever()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/asyncio/base_events.py\", line 607, in run_forever\n",
" self._run_once()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/asyncio/base_events.py\", line 1922, in _run_once\n",
" handle._run()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/asyncio/events.py\", line 80, in _run\n",
" self._context.run(self._callback, *self._args)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py\", line 516, in dispatch_queue\n",
" await self.process_one()\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py\", line 505, in process_one\n",
" await dispatch(*args)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py\", line 412, in dispatch_shell\n",
" await result\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py\", line 740, in execute_request\n",
" reply_content = await reply_content\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/ipkernel.py\", line 422, in do_execute\n",
" res = shell.run_cell(\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/ipykernel/zmqshell.py\", line 546, in run_cell\n",
" return super().run_cell(*args, **kwargs)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3024, in run_cell\n",
" result = self._run_cell(\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3079, in _run_cell\n",
" result = runner(coro)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/async_helpers.py\", line 129, in _pseudo_sync_runner\n",
" coro.send(None)\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3284, in run_cell_async\n",
" has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3466, in run_ast_nodes\n",
" if await self.run_code(code, result, async_=asy):\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py\", line 3526, in run_code\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
" File \"/var/folders/ms/yzwxs7r54nx0q978svcpsnbh0000gn/T/ipykernel_84926/3546127722.py\", line 1, in <module>\n",
" import pandas as pd\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/pandas/__init__.py\", line 49, in <module>\n",
" from pandas.core.api import (\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/pandas/core/api.py\", line 9, in <module>\n",
" from pandas.core.dtypes.dtypes import (\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/pandas/core/dtypes/dtypes.py\", line 24, in <module>\n",
" from pandas._libs import (\n",
" File \"/Users/georgehc/anaconda3/lib/python3.11/site-packages/pyarrow/__init__.py\", line 65, in <module>\n",
" import pyarrow.lib as _lib\n"
]
},
{
"ename": "AttributeError",
"evalue": "_ARRAY_API not found",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;31mAttributeError\u001b[0m: _ARRAY_API not found"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>England</th>\n",
" <th>Wales</th>\n",
" <th>Scotland</th>\n",
" <th>N. Ireland</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Cheese</th>\n",
" <td>105</td>\n",
" <td>103</td>\n",
" <td>103</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Carcass meat</th>\n",
" <td>245</td>\n",
" <td>227</td>\n",
" <td>242</td>\n",
" <td>267</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Other meat</th>\n",
" <td>685</td>\n",
" <td>803</td>\n",
" <td>750</td>\n",
" <td>586</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fish</th>\n",
" <td>147</td>\n",
" <td>160</td>\n",
" <td>122</td>\n",
" <td>93</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fats and oils</th>\n",
" <td>193</td>\n",
" <td>235</td>\n",
" <td>184</td>\n",
" <td>209</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Sugars</th>\n",
" <td>156</td>\n",
" <td>175</td>\n",
" <td>147</td>\n",
" <td>139</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fresh potatoes</th>\n",
" <td>720</td>\n",
" <td>874</td>\n",
" <td>566</td>\n",
" <td>1033</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fresh Veg</th>\n",
" <td>253</td>\n",
" <td>265</td>\n",
" <td>171</td>\n",
" <td>143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Other Veg</th>\n",
" <td>488</td>\n",
" <td>570</td>\n",
" <td>418</td>\n",
" <td>355</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Processed potatoes</th>\n",
" <td>198</td>\n",
" <td>203</td>\n",
" <td>220</td>\n",
" <td>187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Processed Veg</th>\n",
" <td>360</td>\n",
" <td>365</td>\n",
" <td>337</td>\n",
" <td>334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fresh fruit</th>\n",
" <td>1102</td>\n",
" <td>1137</td>\n",
" <td>957</td>\n",
" <td>674</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Cereals</th>\n",
" <td>1472</td>\n",
" <td>1582</td>\n",
" <td>1462</td>\n",
" <td>1494</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Beverages</th>\n",
" <td>57</td>\n",
" <td>73</td>\n",
" <td>53</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Soft drinks</th>\n",
" <td>1374</td>\n",
" <td>1256</td>\n",
" <td>1572</td>\n",
" <td>1506</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Alcoholic drinks</th>\n",
" <td>375</td>\n",
" <td>475</td>\n",
" <td>458</td>\n",
" <td>135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Confectionary</th>\n",
" <td>54</td>\n",
" <td>64</td>\n",
" <td>62</td>\n",
" <td>41</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" England Wales Scotland N. Ireland\n",
"Cheese 105 103 103 66\n",
"Carcass meat 245 227 242 267\n",
"Other meat 685 803 750 586\n",
"Fish 147 160 122 93\n",
"Fats and oils 193 235 184 209\n",
"Sugars 156 175 147 139\n",
"Fresh potatoes 720 874 566 1033\n",
"Fresh Veg 253 265 171 143\n",
"Other Veg 488 570 418 355\n",
"Processed potatoes 198 203 220 187\n",
"Processed Veg 360 365 337 334\n",
"Fresh fruit 1102 1137 957 674\n",
"Cereals 1472 1582 1462 1494\n",
"Beverages 57 73 53 47\n",
"Soft drinks 1374 1256 1572 1506\n",
"Alcoholic drinks 375 475 458 135\n",
"Confectionary 54 64 62 41"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"food_df = pd.DataFrame(food_data, columns=column_labels, index=row_labels)\n",
"food_df.head(20)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(17, 4)\n"
]
}
],
"source": [
"print(np.shape(food_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Running PCA"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 800x550 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"\n",
"# food_data has one row per food item and one column per region, so we transpose it:\n",
"# scikit-learn treats each row as one sample (region) and each column as one feature (food item)\n",
"single_dimension_pca = PCA(n_components=1)\n",
"# fit the PCA model and project the whole dataset in one call\n",
"single_dimension_food_data = single_dimension_pca.fit_transform(food_data.T)\n",
"\n",
"# fit_transform returns a 2D array with a single column; take that column once so the\n",
"# plotting code below works with plain scalars instead of length-1 arrays\n",
"pc1_scores = single_dimension_food_data[:, 0]\n",
"\n",
"# matplotlib doesn't have a built-in 1D scatter plot but we can\n",
"# just use a 2D scatter plot with y-axis values all set to 0\n",
"y_axis_all_zeros = np.zeros(len(single_dimension_food_data))\n",
"plt.scatter(pc1_scores, y_axis_all_zeros)\n",
"\n",
"# shift each label slightly left and down so it does not sit on top of its marker\n",
"for idx in range(len(pc1_scores)):\n",
"    plt.annotate(column_labels[idx], (pc1_scores[idx] - 15, y_axis_all_zeros[idx] - 0.011), rotation=-30)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Explaining the results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For this we can plot the data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 800x800 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# One bar chart per region so the raw consumption profiles can be compared by eye.\n",
"fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8,8))\n",
"\n",
"food_positions = range(len(row_labels))\n",
"for region_idx, ax in enumerate((ax1, ax2, ax3, ax4)):\n",
"    # column region_idx of food_data holds the measurements for column_labels[region_idx]\n",
"    ax.bar(food_positions, food_data[:, region_idx])\n",
"    ax.set_title(column_labels[region_idx])\n",
"    ax.set_xticks(food_positions)\n",
"    ax.set_xticklabels(row_labels, rotation=90)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['Fresh fruit' 'Alcoholic drinks' 'Fresh potatoes' 'Other meat'\n",
" 'Other Veg' 'Soft drinks' 'Fresh Veg' 'Fish' 'Cheese' 'Carcass meat'\n",
" 'Cereals' 'Sugars' 'Processed Veg' 'Confectionary' 'Processed potatoes'\n",
" 'Beverages' 'Fats and oils']]\n"
]
}
],
"source": [
"# components_ is 2D (one row per principal component), so take row 0 first;\n",
"# argsort on the 2D array would return a nested (1, 17) index matrix instead of a flat ranking.\n",
"# Negating the absolute values makes argsort (ascending) list the largest |weight| first.\n",
"importance_idx = np.argsort(-np.abs(single_dimension_pca.components_[0]))\n",
"# print row_labels in descending importance order\n",
"print(np.asarray(row_labels)[importance_idx])\n",
"# if interested, you could refer to the bar chart to verify"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some reasons?\n",
"\n",
"- People in Northern Ireland eat way more grams of fresh potatoes and way fewer of fresh fruit, cheese, fish and alcoholic drinks\n",
"- It turns out that Northern Ireland is the only one of the four countries not on the island of Great Britain"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using PCA with 2 components instead of two"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1200x800 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# similarly we define a PCA with two components\n",
"two_dimension_pca = PCA(n_components=2)\n",
"two_dimension_food_data = two_dimension_pca.fit_transform(food_data.T)\n",
"\n",
"# pull the two coordinate columns out once so the plotting code below stays short\n",
"first_pc = two_dimension_food_data[:, 0]\n",
"second_pc = two_dimension_food_data[:, 1]\n",
"\n",
"# Notice that this is another way of plotting subplots\n",
"# ----------------------------------------------------\n",
"plt.figure(figsize=(12,8))\n",
"\n",
"plt.subplot(2,2,1) #upper left figure\n",
"plt.scatter(first_pc, second_pc)\n",
"for region, x, y in zip(column_labels, first_pc, second_pc):\n",
"    plt.annotate(region, (x, y), rotation=0)\n",
"plt.axis('equal')\n",
"plt.xlabel(\"PC1\")\n",
"plt.ylabel(\"PC2\")\n",
"\n",
"# PC1 on its own; these coordinates match the ones from the one-component PCA\n",
"plt.subplot(2,2,3) #lower left figure\n",
"plt.scatter(first_pc, y_axis_all_zeros)\n",
"for region, x, y in zip(column_labels, first_pc, y_axis_all_zeros):\n",
"    plt.annotate(region, (x, y), rotation=90)\n",
"plt.axis('equal')\n",
"plt.xlabel(\"PC1\")\n",
"\n",
"# PC2 on its own, drawn vertically (x is pinned to 0 by reusing the all-zeros array)\n",
"plt.subplot(2,2,2) #upper right figure\n",
"plt.scatter(y_axis_all_zeros, second_pc)\n",
"for region, x, y in zip(column_labels, y_axis_all_zeros, second_pc):\n",
"    plt.annotate(region, (x, y), rotation=0)\n",
"plt.axis('equal')\n",
"plt.ylabel(\"PC2\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### PCA Results"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data points for decomposition into 1 dimension:\n",
"\n",
"[[ 144.99315218]\n",
" [ 240.52914764]\n",
" [ 91.869339 ]\n",
" [-477.39163882]]\n",
"\n",
"\n",
"Data points for decomposition into 2 dimensions:\n",
"\n",
"[[ 144.99315218 2.53299944]\n",
" [ 240.52914764 224.64692488]\n",
" [ 91.869339 -286.08178613]\n",
" [-477.39163882 58.90186182]]\n"
]
}
],
"source": [
"print('Data points for decomposition into 1 dimension:\\n')\n",
"print(single_dimension_food_data)\n",
"print('\\n\\nData points for decomposition into 2 dimensions:\\n')\n",
"print(two_dimension_food_data)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The explained ratio for decomposition into 1 dimension is 0.6744434639658383\n",
"\n",
"The explained ratio for decomposition into 2 dimensions is 0.6744434639658383 and 0.2905247457687651\n"
]
}
],
"source": [
"print('The explained ratio for decomposition into 1 dimension is', single_dimension_pca.explained_variance_ratio_[0])\n",
"print('\\nThe explained ratio for decomposition into 2 dimensions is', two_dimension_pca.explained_variance_ratio_[0], \n",
" 'and', two_dimension_pca.explained_variance_ratio_[1])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(0.9649682097346034)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"two_dimension_pca.explained_variance_ratio_[0] + two_dimension_pca.explained_variance_ratio_[1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Differences among fit, transform, and fit_transform"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When we fit the data before by doing `single_dimension_pca.fit_transform(food_data.T)` we actually ran two methods: `fit()` followed by `transform()`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Usually this is really helpful when we create machine learning models because we can fit the model and then inject new data to be \"transformed\" or predicted. That is, `fit()` fits the model to the data we pass as a parameter, and `transform()` then applies the already-fitted model to whatever data we give it."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"one_dim_pca = PCA(n_components=1)\n",
"one_dim_pca_fitted_model = one_dim_pca.fit(food_data.T)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can look at results by using our original data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 144.99315218]\n",
" [ 240.52914764]\n",
" [ 91.869339 ]\n",
" [-477.39163882]]\n"
]
}
],
"source": [
"one_dim_pca_results = one_dim_pca_fitted_model.transform(food_data.T)\n",
"print(one_dim_pca_results)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[144.99315218],\n",
" [240.52914764]])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"one_dim_pca_fitted_model.transform([food_data[:, 0], food_data[:, 1]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"However, we could also plug in new data that the PCA model was not fitted on (for example, if we collected the same 17 measurements for Adelaide, we could pass them to `transform` as well)."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1463.14536953 484.06253401 1026.04095959 1423.06323832 613.8601646\n",
" 1230.06955441 622.08729612 468.08165388 1197.47234605 1119.54593613\n",
" 677.6311532 1164.93039436 985.30009739 1283.01217997 312.3646879\n",
" 1107.82607575 900.13233599]\n"
]
}
],
"source": [
"# Let's imagine this is the data for Adelaide: one made-up value per food item.\n",
"# A seeded generator keeps these numbers (and the projection computed from them)\n",
"# the same on every Restart & Run All.\n",
"adelaide_rng = np.random.default_rng(0)\n",
"adelaide_data = adelaide_rng.uniform(low=100, high=1500, size=(len(row_labels),))\n",
"print(adelaide_data)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The results for using our moodel with Adelaide's dataset is: 1391.0111600258065\n"
]
}
],
"source": [
"# Project the made-up Adelaide data onto the first principal component.\n",
"# transform expects a 2D input (one row per sample), hence the extra [ ];\n",
"# [0][0] then pulls the single PC1 coordinate out of the 1-by-1 result.\n",
"print(\"The result of using our model with Adelaide's dataset is: \",\n",
"      one_dim_pca_fitted_model.transform([adelaide_data])[0][0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Interpretation\n",
"\n",
"How do we interpret the low-dimensional representation? Why is Northern Ireland so far away from the other points? One way to try to answer this question is to first look at what features (i.e., what specific food/drink items) are being assigned high weight by PCA:"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.05695538 -0.04792763 0.25891666 0.08441498 0.00519362 0.03762098\n",
" -0.40140206 0.15184994 0.24359373 0.02688623 0.03648827 0.6326409\n",
" 0.04770286 0.02618776 -0.23224414 0.46396817 0.0296502 ]]\n",
"(1, 17)\n"
]
}
],
"source": [
"print(single_dimension_pca.components_) # index 0 is for the 1st principal component\n",
"print(np.shape(single_dimension_pca.components_))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.05695538 -0.04792763 0.25891666 0.08441498 0.00519362 0.03762098\n",
" -0.40140206 0.15184994 0.24359373 0.02688623 0.03648827 0.6326409\n",
" 0.04770286 0.02618776 -0.23224414 0.46396817 0.0296502 ]\n",
"[-0.01601285 -0.01391582 0.01533114 0.05075495 0.09538866 0.0430217\n",
" 0.71501708 0.14490027 0.22545092 -0.04285076 0.0454518 0.17774074\n",
" 0.21259968 0.03056054 -0.55512431 -0.11353652 -0.00594992]\n"
]
}
],
"source": [
"print(two_dimension_pca.components_[0])\n",
"print(two_dimension_pca.components_[1])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Fresh fruit' 'Alcoholic drinks' 'Fresh potatoes' 'Other meat'\n",
" 'Other Veg' 'Soft drinks' 'Fresh Veg' 'Fish' 'Cheese' 'Carcass meat'\n",
" 'Cereals' 'Sugars' 'Processed Veg' 'Confectionary' 'Processed potatoes'\n",
" 'Beverages' 'Fats and oils']\n"
]
}
],
"source": [
"importance_idx = np.argsort(-abs(two_dimension_pca.components_[0]))\n",
"# print row_labels in descending importance order\n",
"print(np.asarray(row_labels)[importance_idx])\n",
"# if interested, you could refer to the bar chart to verify"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Importantly, the way an already-fitted PCA projects a data point to 1D is to take a weighted combination of the data point's features using the above weights (after first subtracting off the feature means). Specifically, here are the calculations for England and Wales:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Single dimension PCA means:\n",
" [ 94.25 245.25 706. 130.5 205.25 154.25 798.25 208. 457.75\n",
" 202. 349. 967.5 1502.5 57.5 1427. 360.75 55.25]\n",
"\n",
"Two dimensions PCA means:\n",
" [ 94.25 245.25 706. 130.5 205.25 154.25 798.25 208. 457.75\n",
" 202. 349. 967.5 1502.5 57.5 1427. 360.75 55.25]\n"
]
}
],
"source": [
"print('Single dimension PCA means:\\n', single_dimension_pca.mean_)\n",
"print('\\nTwo dimensions PCA means:\\n', two_dimension_pca.mean_)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(144.99315218207673)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.inner(single_dimension_pca.components_[0], food_data[:, 0] - single_dimension_pca.mean_)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(240.52914763517674)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.inner(single_dimension_pca.components_[0],\n",
" food_data[:, 1] - single_dimension_pca.mean_)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(2.5329994370406084)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.inner(two_dimension_pca.components_[1],\n",
" food_data[:, 0] - two_dimension_pca.mean_)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(224.6469248812689)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.inner(two_dimension_pca.components_[1],\n",
" food_data[:, 1] - two_dimension_pca.mean_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Argsort"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the previous lecture we saw the `sorted` function; now we introduce numpy's `argsort`, which does *not* return the sorted list but instead returns the indices that would sort the list (put another way, entry `k` of the result is the position in the original list of the `k`-th smallest element)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Going back to our previous example with the food data, in PCA, weights with larger absolute value correspond to features that lead to the largest spread along the projected 1D axis. Here's some code to rank the weights by largest absolute value to smallest absolute value:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index Food Absolute Value\n",
"----- -------------------- ----------------------\n",
"11 Fresh fruit 0.6326408978722377 \n",
"15 Alcoholic drinks 0.4639681679767063 \n",
"6 Fresh potatoes -0.4014020602962481 \n",
"2 Other meat 0.25891665833612115 \n",
"8 Other Veg 0.24359372899027432 \n",
"14 Soft drinks -0.23224414047289454 \n",
"7 Fresh Veg 0.1518499415623022 \n",
"3 Fish 0.08441498252508357 \n",
"0 Cheese 0.05695537978568527 \n",
"1 Carcass meat -0.04792762813468528 \n",
"12 Cereals 0.04770285837364895 \n",
"5 Sugars 0.03762098283940196 \n",
"10 Processed Veg 0.03648826911159385 \n",
"16 Confectionary 0.029650201087993874 \n",
"9 Processed potatoes 0.026886232536746928 \n",
"13 Beverages 0.02618775590853346 \n",
"4 Fats and oils 0.005193622660047751 \n"
]
}
],
"source": [
"pc1_weights = single_dimension_pca.components_[0]\n",
"abs_1PC_weights = np.abs(pc1_weights)\n",
"\n",
"ranking_abs_1PC_weights = np.argsort(-abs_1PC_weights) # use negative to get largest to smallest\n",
"\n",
"# Printing out the food items from highest to lowest absolute value weight.\n",
"# The weight itself is printed with its sign (the sign tells us in which direction the\n",
"# food pushes a region along PC1), so the column is labelled 'Weight'.\n",
"# One shared format string keeps the header, the rule, and the rows aligned.\n",
"row_format = \"{0:5} {1:20} {2:22}\"\n",
"print(row_format.format('Index', 'Food', 'Weight'))\n",
"print(row_format.format('-----', '--------------------', '----------------------'))\n",
"for index in ranking_abs_1PC_weights:\n",
"    print(row_format.format(str(index), row_labels[index], str(pc1_weights[index])))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using argsort with our example"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 144.99315218],\n",
" [ 240.52914764],\n",
" [ 91.869339 ],\n",
" [-477.39163882]])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"single_dimension_food_data"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wales : 240.5291476351767\n",
"England : 144.9931521820767\n",
"Scotland : 91.86933899886354\n",
"N. Ireland : -477.39163881611705\n"
]
}
],
"source": [
"ranking_of_region_from_large_to_small_1st_component = \\\n",
"np.argsort(-(single_dimension_food_data[:,0] - np.average(single_dimension_food_data[:,0])))\n",
"\n",
"for index in ranking_of_region_from_large_to_small_1st_component:\n",
" print(column_labels[index], \":\", single_dimension_food_data[index,0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using argsort with a dictionary"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"dict_fruits = {\"apple\":10, \"pear\":7, \"banana\":11, \"grape\":20, \"orange\":12}\n",
"stock = Counter(dict_fruits)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('grape', 20), ('orange', 12), ('banana', 11), ('apple', 10), ('pear', 7)]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(stock.items(), reverse=True, key = lambda x:x[1])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-1, -2])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.dot(-1, [1,2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Task:** Try to return a list in descending order based on the stock with argsort.\n",
"\n",
"Useful methods:\n",
"- Counter.keys()\n",
"- Counter.values()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['grape', 'orange', 'banana', 'apple', 'pear'], dtype='<U6')"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_index = np.argsort(np.dot(-1, list(stock.values())))\n",
"\n",
"# another way to do it in desecending order\n",
"# sorted_index = np.argsort(list(stock.values()))[::-1]\n",
"\n",
"sorted_stock_keys = np.array(list(stock.keys()))\n",
"sorted_stock_keys[sorted_index]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using argsort with matrices"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Suppose we have a list of fruits with their respective prices. These prices correspond to 4 states in the Australia.\n",
"\n",
"**Tasks:** \n",
"- Give a list of the fruits from the most expensive to the cheapest. This thinking that each row correspond to one state.\n",
"- Now, do the same, but now think that the states are actually the columns of the matrix."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"fruits = np.array([['apple', 'banana', 'kiwi', 'passionfruit'], \n",
" ['mango', 'orange', 'mandarin', 'citrus'], \n",
" ['watermelon', 'rockmelon', 'papaya', 'grape'], \n",
" ['plum', 'peach', 'apricot', 'lychee']])\n",
"\n",
"fruit_prices = np.array([[5,3,12,1],\n",
" [12,5,3,9],\n",
" [2,6,1,19],\n",
" [1,5,4,14]])"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[3 0 2 0]\n",
" [2 1 1 1]\n",
" [0 3 3 3]\n",
" [1 2 0 2]]\n",
"\n",
"[[3 1 0 2]\n",
" [2 1 3 0]\n",
" [2 0 1 3]\n",
" [0 2 1 3]]\n"
]
}
],
"source": [
"#return index matrix sorting by column\n",
"print(np.argsort(fruit_prices, axis=0))\n",
"print()\n",
"\n",
"#return index matrix sorting by row\n",
"print(np.argsort(fruit_prices, axis=1))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[3, 1, 0, 2],\n",
" [2, 1, 3, 0],\n",
" [2, 0, 1, 3],\n",
" [0, 2, 1, 3]])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_fruit_indices = np.argsort(fruit_prices, axis=1)\n",
"sorted_fruit_indices"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[array(['passionfruit', 'banana', 'apple', 'kiwi'], dtype='<U12'),\n",
" array(['mandarin', 'orange', 'citrus', 'mango'], dtype='<U12'),\n",
" array(['papaya', 'watermelon', 'rockmelon', 'grape'], dtype='<U12'),\n",
" array(['plum', 'apricot', 'peach', 'lychee'], dtype='<U12')]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To show the results of this in terms of the labels you can do as follow\n",
"[fruit[sorted_fruit_indices[idx]] for idx, fruit in enumerate(fruits)]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([['passionfruit', 'banana', 'apple', 'kiwi'],\n",
" ['mandarin', 'orange', 'citrus', 'mango'],\n",
" ['papaya', 'watermelon', 'rockmelon', 'grape'],\n",
" ['plum', 'apricot', 'peach', 'lychee']], dtype='<U12')"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# You can also use \n",
"np.take_along_axis(fruits, sorted_fruit_indices, axis=1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment