Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 94-775/95-865: Recitation 2\n",
"Author: Erick Rodriguez (erickger [at symbol] cmu.edu)\n",
"\n",
"This demo is based on Mark Richardson's 2009 \"Principle Component Analysis\" notes and uses data he pulled from DEFRA on 1997 UK food consumption (grams/person/week). This dataset is also used as a nice illustrated example of PCA here:\n",
"http://setosa.io/ev/principal-component-analysis/"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Creating the dataset"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later\n",
"# releases dropped the old names, so try the new name first and fall back\n",
"# to the legacy one on older installs.\n",
"try:\n",
"    plt.style.use('seaborn-v0_8') # prettier plots\n",
"except OSError:\n",
"    plt.style.use('seaborn')\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# grams per person per week\n",
"food_data = np.array([[105, 103, 103, 66],\n",
" [245, 227, 242, 267],\n",
" [685, 803, 750, 586],\n",
" [147, 160, 122, 93],\n",
" [193, 235, 184, 209], \n",
" [156, 175, 147, 139],\n",
" [720, 874, 566, 1033],\n",
" [253, 265, 171, 143],\n",
" [488, 570, 418, 355],\n",
" [198, 203, 220, 187],\n",
" [360, 365, 337, 334],\n",
" [1102, 1137, 957, 674],\n",
" [1472, 1582, 1462, 1494],\n",
" [57, 73, 53, 47],\n",
" [1374, 1256, 1572, 1506],\n",
" [375, 475, 458, 135],\n",
" [54, 64, 62, 41]])\n",
"row_labels = ['Cheese',\n",
" 'Carcass meat',\n",
" 'Other meat',\n",
" 'Fish',\n",
" 'Fats and oils',\n",
" 'Sugars',\n",
" 'Fresh potatoes',\n",
" 'Fresh Veg',\n",
" 'Other Veg',\n",
" 'Processed potatoes',\n",
" 'Processed Veg',\n",
" 'Fresh fruit',\n",
" 'Cereals',\n",
" 'Beverages',\n",
" 'Soft drinks',\n",
" 'Alcoholic drinks',\n",
" 'Confectionary']\n",
"column_labels = ['England', 'Wales', 'Scotland', 'N. Ireland']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Looking at the table with a dataframe"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>England</th>\n",
" <th>Wales</th>\n",
" <th>Scotland</th>\n",
" <th>N. Ireland</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Cheese</th>\n",
" <td>105</td>\n",
" <td>103</td>\n",
" <td>103</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Carcass meat</th>\n",
" <td>245</td>\n",
" <td>227</td>\n",
" <td>242</td>\n",
" <td>267</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Other meat</th>\n",
" <td>685</td>\n",
" <td>803</td>\n",
" <td>750</td>\n",
" <td>586</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fish</th>\n",
" <td>147</td>\n",
" <td>160</td>\n",
" <td>122</td>\n",
" <td>93</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fats and oils</th>\n",
" <td>193</td>\n",
" <td>235</td>\n",
" <td>184</td>\n",
" <td>209</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Sugars</th>\n",
" <td>156</td>\n",
" <td>175</td>\n",
" <td>147</td>\n",
" <td>139</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fresh potatoes</th>\n",
" <td>720</td>\n",
" <td>874</td>\n",
" <td>566</td>\n",
" <td>1033</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fresh Veg</th>\n",
" <td>253</td>\n",
" <td>265</td>\n",
" <td>171</td>\n",
" <td>143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Other Veg</th>\n",
" <td>488</td>\n",
" <td>570</td>\n",
" <td>418</td>\n",
" <td>355</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Processed potatoes</th>\n",
" <td>198</td>\n",
" <td>203</td>\n",
" <td>220</td>\n",
" <td>187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Processed Veg</th>\n",
" <td>360</td>\n",
" <td>365</td>\n",
" <td>337</td>\n",
" <td>334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fresh fruit</th>\n",
" <td>1102</td>\n",
" <td>1137</td>\n",
" <td>957</td>\n",
" <td>674</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Cereals</th>\n",
" <td>1472</td>\n",
" <td>1582</td>\n",
" <td>1462</td>\n",
" <td>1494</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Beverages</th>\n",
" <td>57</td>\n",
" <td>73</td>\n",
" <td>53</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Soft drinks</th>\n",
" <td>1374</td>\n",
" <td>1256</td>\n",
" <td>1572</td>\n",
" <td>1506</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Alcoholic drinks</th>\n",
" <td>375</td>\n",
" <td>475</td>\n",
" <td>458</td>\n",
" <td>135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Confectionary</th>\n",
" <td>54</td>\n",
" <td>64</td>\n",
" <td>62</td>\n",
" <td>41</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" England Wales Scotland N. Ireland\n",
"Cheese 105 103 103 66\n",
"Carcass meat 245 227 242 267\n",
"Other meat 685 803 750 586\n",
"Fish 147 160 122 93\n",
"Fats and oils 193 235 184 209\n",
"Sugars 156 175 147 139\n",
"Fresh potatoes 720 874 566 1033\n",
"Fresh Veg 253 265 171 143\n",
"Other Veg 488 570 418 355\n",
"Processed potatoes 198 203 220 187\n",
"Processed Veg 360 365 337 334\n",
"Fresh fruit 1102 1137 957 674\n",
"Cereals 1472 1582 1462 1494\n",
"Beverages 57 73 53 47\n",
"Soft drinks 1374 1256 1572 1506\n",
"Alcoholic drinks 375 475 458 135\n",
"Confectionary 54 64 62 41"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"food_df = pd.DataFrame(food_data, columns=column_labels, index=row_labels)\n",
"food_df.head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Running PCA"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 576x396 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"\n",
"# instantiate PCA class\n",
"single_dimension_pca = PCA(n_components=1)\n",
"# use our pca to fit and transform the whole dataset\n",
"single_dimension_food_data = single_dimension_pca.fit_transform(food_data.T)\n",
"\n",
"# matplotlib doesn't have a built-in 1D scatter plot but we can\n",
"# just use a 2D scatter plot with y-axis values all set to 0\n",
"y_axis_all_zeros = np.zeros(len(single_dimension_food_data))\n",
"plt.scatter(single_dimension_food_data, y_axis_all_zeros)\n",
"\n",
"for idx in range(len(single_dimension_food_data)):\n",
" plt.annotate(column_labels[idx], (single_dimension_food_data[idx] - 15, y_axis_all_zeros[idx]-0.011), rotation=-30)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Explaining the results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For this we can plot the data"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 576x576 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig, axes = plt.subplots(2, 2, figsize=(8,8))\n",
"x_positions = range(len(row_labels))\n",
"\n",
"# One bar chart per region. axes.flat walks the 2x2 grid row by row, so the\n",
"# panels appear in the same order as column_labels / the columns of food_data.\n",
"for region_idx, ax in enumerate(axes.flat):\n",
"    ax.bar(x_positions, food_data[:, region_idx])\n",
"    ax.set_title(column_labels[region_idx])\n",
"    ax.set_xticks(x_positions)\n",
"    ax.set_xticklabels(row_labels, rotation=90)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some reasons?\n",
"\n",
"- Northern Ireland eats far more fresh potatoes (in grams) and far less fresh fruit, cheese, fish, and alcoholic drinks than the other three countries\n",
"- It turns out that Northern Ireland is the only one of the four countries not on the island of Great Britain"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using PCA with 2 components instead of 1"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 864x576 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# similarly we define a PCA with two components\n",
"two_dimension_pca = PCA(n_components=2)\n",
"two_dimension_food_data = two_dimension_pca.fit_transform(food_data.T)\n",
"\n",
"# Notice that this is another way of plotting subplots\n",
"# ----------------------------------------------------\n",
"plt.figure(figsize=(12,8))\n",
"\n",
"plt.subplot(2,2,1) #upper left figure\n",
"plt.scatter(two_dimension_food_data[:,0], two_dimension_food_data[:,1])\n",
"for idx in range(len(two_dimension_food_data)):\n",
" plt.annotate(column_labels[idx], (two_dimension_food_data[:,0][idx], two_dimension_food_data[:,1][idx]), rotation=0)\n",
"plt.xlabel(\"PCA 1\")\n",
"plt.ylabel(\"PCA 2\")\n",
"\n",
"# note this is the first PC, and it is completely the same with the one with only one PC.\n",
"plt.subplot(2,2,3) #lower left figure\n",
"plt.scatter(two_dimension_food_data[:,0], y_axis_all_zeros)\n",
"for idx in range(len(two_dimension_food_data)):\n",
" plt.annotate(column_labels[idx], (two_dimension_food_data[:,0][idx], y_axis_all_zeros[idx]), rotation=90)\n",
"plt.xlabel(\"PCA 1\")\n",
"\n",
"plt.subplot(2,2,2) #upper right figure\n",
"plt.scatter(y_axis_all_zeros, two_dimension_food_data[:,1])\n",
"for idx in range(len(two_dimension_food_data)):\n",
" plt.annotate(column_labels[idx], (y_axis_all_zeros[idx], two_dimension_food_data[:,1][idx]), rotation=0)\n",
"plt.ylabel(\"PCA 2\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### PCA Results"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data points for decomposition into 1 dimension:\n",
"\n",
"[[-144.99315218]\n",
" [-240.52914764]\n",
" [ -91.869339 ]\n",
" [ 477.39163882]]\n",
"\n",
"\n",
"Data points for decomposition into 2 dimensions:\n",
"\n",
"[[-144.99315218 -2.53299944]\n",
" [-240.52914764 -224.64692488]\n",
" [ -91.869339 286.08178613]\n",
" [ 477.39163882 -58.90186182]]\n"
]
}
],
"source": [
"print('Data points for decomposition into 1 dimension:\\n')\n",
"print(single_dimension_food_data)\n",
"print('\\n\\nData points for decomposition into 2 dimensions:\\n')\n",
"print(two_dimension_food_data)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The explained ratio for decomposition into 1 dimension is 0.6744434639658381\n",
"\n",
"The explained ratio for decomposition into 2 dimensions is 0.6744434639658381 and 0.29052474576876536\n"
]
}
],
"source": [
"print('The explained ratio for decomposition into 1 dimension is', single_dimension_pca.explained_variance_ratio_[0])\n",
"print('\\nThe explained ratio for decomposition into 2 dimensions is', two_dimension_pca.explained_variance_ratio_[0], \n",
" 'and', two_dimension_pca.explained_variance_ratio_[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Differences among fit, transform, and fit_transform"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When we called `single_dimension_pca.fit_transform(food_data.T)` earlier, we actually ran two methods in sequence: `fit()` and then `transform()`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Keeping the two steps separate is really helpful when we build machine learning models, because we can fit the model once and then pass in new data to be \"transformed\" or predicted. That is, `fit()` fits the model to the data we pass as a parameter, and `transform()` applies the already-fitted model to whatever data it is given."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"# fit() only learns the projection from the data; it returns the estimator itself,\n",
"# so one_dim_pca_fitted_model and one_dim_pca refer to the same fitted object.\n",
"one_dim_pca = PCA(n_components=1)\n",
"one_dim_pca_fitted_model = one_dim_pca.fit(food_data.T)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can look at results by using our original data"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-144.99315218]\n",
" [-240.52914764]\n",
" [ -91.869339 ]\n",
" [ 477.39163882]]\n"
]
}
],
"source": [
"one_dim_pca_results = one_dim_pca_fitted_model.transform(food_data.T)\n",
"print(one_dim_pca_results)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-144.99315218],\n",
" [-240.52914764]])"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"one_dim_pca_fitted_model.transform([food_data[:, 0], food_data[:, 1]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"However, we could actually plug in new data that we didn't fit within the PCA model (for example, if we collected the 17 measurements for Adelaide, we could use it with transform as well, etc)."
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 988.7778723 127.25386502 383.61871694 215.68171475 312.97931132\n",
" 1078.52755231 1360.1017595 751.51907256 664.38757406 265.34368123\n",
" 1145.75372296 1281.77430384 426.36468158 439.01033511 1498.55667725\n",
" 930.49749995 132.4467529 ]\n"
]
}
],
"source": [
"# Let's imagine this is the data for Adelaide: one made-up value per food item\n",
"np.random.seed(0) # fix the seed so the made-up numbers are the same on every run\n",
"adelaide_data = np.random.uniform(low=100, high=1500, size=(len(row_labels),))\n",
"print(adelaide_data)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The results for using our moodel with Adelaide's dataset is: -361.26558393738446\n"
]
}
],
"source": [
"# Now let's project the Adelaide data with the already-fitted model.\n",
"# transform() expects a 2D input (one row per sample), hence the extra brackets;\n",
"# [0][0] then pulls the single projected value out of the 1x1 result.\n",
"print(\"The result of using our model with Adelaide's dataset is:\",\n",
"      one_dim_pca_fitted_model.transform([adelaide_data])[0][0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Interpretation\n",
"\n",
"How do we interpret the low-dimensional representation? Why is Northern Ireland so far away from the other points? One way to try to answer this question is to first look at what features (i.e., what specific food/drink items) are being assigned high weight by PCA:"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-0.05695538 0.04792763 -0.25891666 -0.08441498 -0.00519362 -0.03762098\n",
" 0.40140206 -0.15184994 -0.24359373 -0.02688623 -0.03648827 -0.6326409\n",
" -0.04770286 -0.02618776 0.23224414 -0.46396817 -0.0296502 ]]\n"
]
}
],
"source": [
"print(single_dimension_pca.components_) # index 0 is for the 1st principal component"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-0.05695538 0.04792763 -0.25891666 -0.08441498 -0.00519362 -0.03762098\n",
" 0.40140206 -0.15184994 -0.24359373 -0.02688623 -0.03648827 -0.6326409\n",
" -0.04770286 -0.02618776 0.23224414 -0.46396817 -0.0296502 ]\n",
"[ 0.01601285 0.01391582 -0.01533114 -0.05075495 -0.09538866 -0.0430217\n",
" -0.71501708 -0.14490027 -0.22545092 0.04285076 -0.0454518 -0.17774074\n",
" -0.21259968 -0.03056054 0.55512431 0.11353652 0.00594992]\n"
]
}
],
"source": [
"print(two_dimension_pca.components_[0])\n",
"print(two_dimension_pca.components_[1])"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Fresh fruit' 'Alcoholic drinks' 'Fresh potatoes' 'Other meat'\n",
" 'Other Veg' 'Soft drinks' 'Fresh Veg' 'Fish' 'Cheese' 'Carcass meat'\n",
" 'Cereals' 'Sugars' 'Processed Veg' 'Confectionary' 'Processed potatoes'\n",
" 'Beverages' 'Fats and oils']\n"
]
}
],
"source": [
"importance_idx = np.argsort(-abs(two_dimension_pca.components_[0]))\n",
"# print row_labels in descending importance order\n",
"print(np.asarray(row_labels)[importance_idx])\n",
"# if interested, you could refer to the bar chart to verify"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Importantly, how PCA (that has already been fitted) actually projects a data point to 1D is to take a weighted combination using the above weights (although it first subtracts off the feature means). Specifically, here are the calculations for England and Wales:"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Single dimension PCA means:\n",
" [ 94.25 245.25 706. 130.5 205.25 154.25 798.25 208. 457.75\n",
" 202. 349. 967.5 1502.5 57.5 1427. 360.75 55.25]\n",
"\n",
"Two dimensions PCA means:\n",
" [ 94.25 245.25 706. 130.5 205.25 154.25 798.25 208. 457.75\n",
" 202. 349. 967.5 1502.5 57.5 1427. 360.75 55.25]\n"
]
}
],
"source": [
"print('Single dimension PCA means:\\n', single_dimension_pca.mean_)\n",
"print('\\nTwo dimensions PCA means:\\n', two_dimension_pca.mean_)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-144.99315218207676"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.inner(single_dimension_pca.components_[0], food_data[:, 0] - single_dimension_pca.mean_)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-240.52914763517666"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.inner(single_dimension_pca.components_[0],\n",
" food_data[:, 1] - single_dimension_pca.mean_)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-2.532999437040636"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.inner(two_dimension_pca.components_[1],\n",
" food_data[:, 0] - two_dimension_pca.mean_)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-224.646924881269"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.inner(two_dimension_pca.components_[1],\n",
" food_data[:, 1] - two_dimension_pca.mean_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Argsort"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the previous lecture we saw the `sorted` function; now we introduce numpy's `argsort`, which does *not* return the sorted list but instead returns the indices that would sort the list (put another way, entry `k` of the result is the position in the original list of the `k`-th smallest element)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Going back to our previous example with the food data, in PCA, weights with larger absolute value correspond to features that lead to the largest spread along the projected 1D axis. Here's some code to rank the weights by largest absolute value to smallest absolute value:"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index Food Absolute Value\n",
"----- -------------------- ----------------------\n",
"11 Fresh fruit -0.6326408978722377 \n",
"15 Alcoholic drinks -0.4639681679767063 \n",
"6 Fresh potatoes 0.40140206029624825 \n",
"2 Other meat -0.25891665833612104 \n",
"8 Other Veg -0.2435937289902743 \n",
"14 Soft drinks 0.2322441404728945 \n",
"7 Fresh Veg -0.15184994156230225 \n",
"3 Fish -0.08441498252508359 \n",
"0 Cheese -0.05695537978568534 \n",
"1 Carcass meat 0.04792762813468509 \n",
"12 Cereals -0.04770285837364884 \n",
"5 Sugars -0.03762098283940194 \n",
"10 Processed Veg -0.03648826911159385 \n",
"16 Confectionary -0.029650201087993867 \n",
"9 Processed potatoes -0.026886232536746928 \n",
"13 Beverages -0.026187755908533446 \n",
"4 Fats and oils -0.0051936226600476955\n"
]
}
],
"source": [
"first_pc_weights = single_dimension_pca.components_[0]\n",
"abs_1PC_weights = np.abs(first_pc_weights)\n",
"\n",
"ranking_abs_1PC_weights = np.argsort(-abs_1PC_weights) # use negative to get largest to smallest\n",
"\n",
"# Printing out the food items from highest to lowest absolute value weight.\n",
"# The ranking uses |weight|, but the printed value keeps its sign, so the column\n",
"# is labelled 'Weight' rather than 'Absolute Value'.\n",
"row_format = \"{0:5} {1:20} {2:22}\"\n",
"print(row_format.format('Index', 'Food', 'Weight'))\n",
"print(row_format.format('-----', '--------------------', '----------------------'))\n",
"for index in ranking_abs_1PC_weights:\n",
"    print(row_format.format(str(index), row_labels[index], str(first_pc_weights[index])))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using argsort with our example"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-144.99315218],\n",
" [-240.52914764],\n",
" [ -91.869339 ],\n",
" [ 477.39163882]])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"single_dimension_food_data"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"N Ireland : 477.3916388161171\n",
"Scotland : -91.86933899886354\n",
"England : -144.9931521820767\n",
"Wales : -240.52914763517674\n"
]
}
],
"source": [
"ranking_of_region_from_large_to_small_1st_component = \\\n",
"np.argsort(-(single_dimension_food_data[:,0] - np.average(single_dimension_food_data[:,0])))\n",
"\n",
"for index in ranking_of_region_from_large_to_small_1st_component:\n",
" print(column_labels[index], \":\", single_dimension_food_data[index,0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using argsort with a dictionary"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"dict_fruits = {\"apple\":10, \"pear\":7, \"banana\":11, \"grape\":20, \"orange\":12}\n",
"stock = Counter(dict_fruits)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('grape', 20), ('orange', 12), ('banana', 11), ('apple', 10), ('pear', 7)]"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(stock.items(), reverse=True, key = lambda x:x[1])"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-1, -2])"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.dot(-1, [1,2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Task:** Try to return a list in descending order based on the stock with argsort.\n",
"\n",
"Useful methods:\n",
"- Counter.keys()\n",
"- Counter.values()"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['grape', 'orange', 'banana', 'apple', 'pear'], dtype='<U6')"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# negate the counts so that argsort's ascending order corresponds to descending stock\n",
"sorted_index = np.argsort(np.dot(-1, list(stock.values())))\n",
"\n",
"# another way to do it in descending order\n",
"# sorted_index = np.argsort(list(stock.values()))[::-1]\n",
"\n",
"# keys() and values() iterate in the same order, so the indices computed from\n",
"# the values can be used directly to reorder the keys\n",
"stock_keys = np.array(list(stock.keys()))\n",
"sorted_stock_keys = stock_keys[sorted_index]\n",
"sorted_stock_keys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using argsort with matrices"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Suppose we have a list of fruits with their respective prices. These prices correspond to 4 states in Australia.\n",
"\n",
"**Tasks:** \n",
"- For each state, list its fruits from the cheapest to the most expensive, assuming that each row corresponds to one state.\n",
"- Now do the same, but assume that the states are the columns of the matrix instead."
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"fruits = np.array([['apple', 'banana', 'kiwi', 'passionfruit'], \n",
" ['mango', 'orange', 'mandarin', 'citrus'], \n",
" ['watermelon', 'rockmelon', 'papaya', 'grape'], \n",
" ['plum', 'peach', 'apricot', 'lychee']])\n",
"\n",
"fruit_prices = np.array([[5,3,12,1],\n",
" [12,5,3,9],\n",
" [2,6,1,19],\n",
" [1,5,4,14]])"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[3 0 2 0]\n",
" [2 1 1 1]\n",
" [0 3 3 3]\n",
" [1 2 0 2]]\n",
"\n",
"[[3 1 0 2]\n",
" [2 1 3 0]\n",
" [2 0 1 3]\n",
" [0 2 1 3]]\n"
]
}
],
"source": [
"#return index matrix sorting by column\n",
"print(np.argsort(fruit_prices, axis=0))\n",
"print()\n",
"\n",
"#return index matrix sorting by row\n",
"print(np.argsort(fruit_prices, axis=1))"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[3, 1, 0, 2],\n",
" [2, 1, 3, 0],\n",
" [2, 0, 1, 3],\n",
" [0, 2, 1, 3]])"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_fruits = np.argsort(fruit_prices, axis=1)\n",
"sorted_fruits"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['passionfruit' 'banana' 'apple' 'kiwi']\n",
"['mandarin' 'orange' 'citrus' 'mango']\n",
"['papaya' 'watermelon' 'rockmelon' 'grape']\n",
"['plum' 'apricot' 'peach' 'lychee']\n"
]
},
{
"data": {
"text/plain": [
"[None, None, None, None]"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To show the results of this in terms of the labels you can do as follows\n",
"# (a plain for loop, so the cell does not also echo a list of None values)\n",
"for fruit_row, price_order in zip(fruits, sorted_fruits):\n",
"    print(fruit_row[price_order])"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([['passionfruit', 'banana', 'apple', 'kiwi'],\n",
" ['mandarin', 'orange', 'citrus', 'mango'],\n",
" ['papaya', 'watermelon', 'rockmelon', 'grape'],\n",
" ['plum', 'apricot', 'peach', 'lychee']], dtype='<U12')"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# You can also use \n",
"np.take_along_axis(fruits, sorted_fruits, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Exercise2 [From demo Co-Occurrence Analysis for Finding Possible Relationships]**\n",
"\n",
"Output the rankings of each pair given the lists of strings for rows and columns"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-0.99657 -4.23412 3.72022]\n",
" [-0.43363 0.06578 -0.62373]\n",
" [ 3.76277 -2.79671 -0.74926]]\n"
]
}
],
"source": [
"#Get the PMI table and lists of names and companies\n",
"np.set_printoptions(precision=5, suppress=True)\n",
"co_occurrence_table = np.array([[10, 15, 300],\n",
" [500, 10000, 500],\n",
" [200, 30, 10]])\n",
"joint_prob_table = co_occurrence_table / co_occurrence_table.sum()\n",
"\n",
"people_prob = joint_prob_table.sum(axis=1)\n",
"company_prob = joint_prob_table.sum(axis=0)\n",
"joint_prob_table_if_people_and_companies_were_indep = np.outer(people_prob, company_prob)\n",
"PMI = np.log2(joint_prob_table / joint_prob_table_if_people_and_companies_were_indep)\n",
"names = ['Elon Musk', 'Mark Zuckerberg', 'Tim Cook']\n",
"companies = ['Apple', 'Facebook', 'Tesla']\n",
"print(PMI)"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(3.7627680252977145, 'Tim Cook', 'Apple'),\n",
" (3.7202223303316297, 'Elon Musk', 'Tesla'),\n",
" (0.06578417815296361, 'Mark Zuckerberg', 'Facebook'),\n",
" (-0.43362918750578877, 'Mark Zuckerberg', 'Apple'),\n",
" (-0.6237320708857317, 'Mark Zuckerberg', 'Tesla'),\n",
" (-0.7492629529695907, 'Tim Cook', 'Tesla'),\n",
" (-0.996565381896946, 'Elon Musk', 'Apple'),\n",
" (-2.796712298097102, 'Tim Cook', 'Facebook'),\n",
" (-4.2341176104044, 'Elon Musk', 'Facebook')]"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Do it with sorted\n",
"PMI_name_company_tuples = [(PMI[row_idx, col_idx], names[row_idx], companies[col_idx]) for row_idx in range(PMI.shape[0]) for col_idx in range(PMI.shape[1])]\n",
"sorted(PMI_name_company_tuples, reverse=True) # without using itemgetter/lambda, sorts based on index 0"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Company Name PMI \n",
"-------------------- ---------- --------------------\n",
"Tim Cook Apple 3.7627680252977145 \n",
"Elon Musk Tesla 3.7202223303316297 \n",
"Mark Zuckerberg Facebook 0.06578417815296361 \n",
"Mark Zuckerberg Apple -0.43362918750578877\n",
"Mark Zuckerberg Tesla -0.6237320708857317 \n",
"Tim Cook Tesla -0.7492629529695907 \n",
"Elon Musk Apple -0.996565381896946 \n",
"Tim Cook Facebook -2.796712298097102 \n",
"Elon Musk Facebook -4.2341176104044 \n"
]
}
],
"source": [
"# do it with argsort\n",
"flat_PMI = PMI.flatten() # row-major, i.e. the same order in which PMI_name_company_tuples was built\n",
"names_list = [t[1] for t in PMI_name_company_tuples]\n",
"companies_list = [t[2] for t in PMI_name_company_tuples]\n",
"sorted_idx = np.argsort(flat_PMI)\n",
"row_format = \"{0:20} {1:10} {2:20}\"\n",
"# header order matches the printed columns: person name first, then company\n",
"print(row_format.format('Name', 'Company', 'PMI'))\n",
"print(row_format.format('--------------------', '----------', '--------------------'))\n",
"for idx in sorted_idx[::-1]: # reversed: highest PMI first\n",
"    print(row_format.format(names_list[idx], companies_list[idx], str(flat_PMI[idx])))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment