Skip to content

Instantly share code, notes, and snippets.

@DanielaLaura
Created January 7, 2021 21:46
Show Gist options
  • Save DanielaLaura/397bcdc68b20047b7e0ced7555e5f0d4 to your computer and use it in GitHub Desktop.
Save DanielaLaura/397bcdc68b20047b7e0ced7555e5f0d4 to your computer and use it in GitHub Desktop.
k-medoids python implementation
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import random\n",
"from sklearn.metrics.pairwise import pairwise_distances"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Column names for wine.csv; the first column is the one named 'class'.\n",
"column_names = [\n",
"    'class', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',\n",
"    'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',\n",
"    'Color intensity', 'Hue', 'diluted wines', 'Proline',\n",
"]\n",
"\n",
"# Because names= is passed explicitly, pandas treats every row of the file as data.\n",
"wine = pd.read_csv('wine.csv', names=column_names)\n",
"\n",
"# Move the 'class' column out of the frame so that `wine` keeps only the\n",
"# remaining feature columns.\n",
"wine_class = wine.pop('class')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Rescale every feature column to [0, 1] (MinMaxScaler's default range) and\n",
"# wrap the result back into a DataFrame with the original column names.\n",
"scaler = MinMaxScaler()\n",
"data = pd.DataFrame(\n",
"    scaler.fit_transform(wine),\n",
"    columns=wine.columns,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def euclideanDistance(x, y):\n",
"    '''\n",
"    Euclidean distance between x, y\n",
"    --------\n",
"    Parameters\n",
"    x, y: array-like of numbers with the same shape\n",
"          (lists, tuples, NumPy arrays and pandas Series are accepted)\n",
"    --------\n",
"    Return\n",
"    d: float (np.float64) for 1-D inputs\n",
"    --------\n",
"    Raise\n",
"    ValueError if x and y do not have the same shape\n",
"    '''\n",
"    # Convert up front so elements are paired by position; indexing a\n",
"    # pandas Series with x[i] is label-based rather than positional.\n",
"    x = np.asarray(x, dtype=float)\n",
"    y = np.asarray(y, dtype=float)\n",
"    if x.shape != y.shape:\n",
"        # NumPy broadcasting would otherwise pair up mismatched inputs\n",
"        # and return a meaningless distance\n",
"        raise ValueError('x and y must have the same shape, got {} and {}'.format(\n",
"            x.shape, y.shape))\n",
"    # sum the squared differences along the first axis, then take the root\n",
"    d = np.sqrt(np.sum((x - y) ** 2, axis=0))\n",
"    return d"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Pairwise Euclidean distances between all rows of the scaled data, computed\n",
"# with scikit-learn rather than with the euclideanDistance helper.\n",
"D = pairwise_distances(data, metric='euclidean')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(178, 178)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"D.shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def kMedoids(D, k, tmax=100):\n",
"    '''\n",
"    k-medoids clustering on a precomputed distance matrix\n",
"    --------\n",
"    Parameters\n",
"    D: (n, n) array-like, D[i, j] is the distance between points i and j\n",
"    k: int, number of clusters (medoids)\n",
"    tmax: int, maximum number of medoid-update iterations\n",
"    --------\n",
"    Return\n",
"    M: array of the k medoid indices, sorted in ascending order\n",
"    C: dict mapping cluster number kappa (the position of the medoid in M)\n",
"       to the array of indices of the points assigned to that cluster\n",
"    --------\n",
"    Raise\n",
"    ValueError if D is not a square matrix, if k < 1, or if k is larger\n",
"    than the number of points left after dropping duplicate points\n",
"    '''\n",
"    D = np.asarray(D)\n",
"    # rows and columns are indexed with the same point indices below,\n",
"    # so D has to be square\n",
"    if D.ndim != 2 or D.shape[0] != D.shape[1]:\n",
"        raise ValueError('D must be a square distance matrix')\n",
"    n = D.shape[0]\n",
"\n",
"    if k < 1:\n",
"        raise ValueError('k must be at least 1')\n",
"    if k > n:\n",
"        raise ValueError('too many medoids')\n",
"\n",
"    # find a set of valid initial cluster medoid indices since we\n",
"    # can't seed different clusters with two points at the same location\n",
"    invalid_medoid_inds = set()\n",
"    rs, cs = np.where(D == 0)\n",
"    # the rows, cols must be shuffled because we will keep the first duplicate below\n",
"    index_shuf = list(range(len(rs)))\n",
"    np.random.shuffle(index_shuf)\n",
"    for r, c in zip(rs[index_shuf], cs[index_shuf]):\n",
"        # if there are two points with a distance of 0...\n",
"        # keep the first one for cluster init\n",
"        if r < c and r not in invalid_medoid_inds:\n",
"            invalid_medoid_inds.add(c)\n",
"    valid_medoid_inds = [i for i in range(n) if i not in invalid_medoid_inds]\n",
"\n",
"    if k > len(valid_medoid_inds):\n",
"        raise ValueError('too many medoids (after removing {} duplicate points)'.format(\n",
"            len(invalid_medoid_inds)))\n",
"\n",
"    # randomly initialize an array of k medoid indices\n",
"    M = np.array(valid_medoid_inds)\n",
"    np.random.shuffle(M)\n",
"    M = np.sort(M[:k])\n",
"\n",
"    # working copy that receives the updated medoids of each iteration\n",
"    Mnew = np.copy(M)\n",
"\n",
"    def assign_clusters(medoids):\n",
"        # give every point to its nearest medoid; returns {kappa: point indices}\n",
"        nearest = np.argmin(D[:, medoids], axis=1)\n",
"        return {kappa: np.where(nearest == kappa)[0] for kappa in range(k)}\n",
"\n",
"    C = {}\n",
"    for _ in range(tmax):\n",
"        # determine clusters, i.e. arrays of data indices\n",
"        C = assign_clusters(M)\n",
"        # update cluster medoids\n",
"        for kappa in range(k):\n",
"            members = C[kappa]\n",
"            if members.size == 0:\n",
"                # an empty cluster has no candidate medoid (np.argmin would\n",
"                # raise on an empty array), so keep the previous medoid\n",
"                continue\n",
"            # the new medoid is the member with the smallest mean distance\n",
"            # to the other members of its cluster\n",
"            mean_dist = np.mean(D[np.ix_(members, members)], axis=1)\n",
"            Mnew[kappa] = members[np.argmin(mean_dist)]\n",
"        # sort in place: np.sort(Mnew) would only return a sorted copy\n",
"        # and leave Mnew itself untouched\n",
"        Mnew.sort()\n",
"        # check for convergence\n",
"        if np.array_equal(M, Mnew):\n",
"            break\n",
"        M = np.copy(Mnew)\n",
"    else:\n",
"        # tmax was exhausted without converging (or tmax == 0):\n",
"        # final update of cluster memberships for the last medoids\n",
"        C = assign_clusters(M)\n",
"\n",
"    # return results\n",
"    return M, C"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Cluster the samples into 3 groups: M receives the medoid indices and C the\n",
"# cluster memberships. The initial medoids are drawn with np.random, so the\n",
"# result can differ between runs unless the seed is fixed.\n",
"M, C = kMedoids(D, k=3)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 35, 117, 148])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"M"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n",
" 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,\n",
" 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,\n",
" 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,\n",
" 52, 53, 54, 55, 56, 57, 58, 63, 66, 71, 73, 74, 81,\n",
" 98, 109, 121, 124], dtype=int64),\n",
" 1: array([ 59, 60, 62, 64, 65, 67, 68, 69, 70, 72, 75, 76, 77,\n",
" 78, 79, 80, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92,\n",
" 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106,\n",
" 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 122,\n",
" 123, 125, 126, 127, 128, 129], dtype=int64),\n",
" 2: array([ 61, 83, 118, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,\n",
" 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,\n",
" 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,\n",
" 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177],\n",
" dtype=int64)}"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"C"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"medoids:\n",
"Alcohol 0.644737\n",
"Malic acid 0.211462\n",
"Ash 0.561497\n",
"Alcalinity of ash 0.510309\n",
"Magnesium 0.326087\n",
"Total phenols 0.593103\n",
"Flavanoids 0.556962\n",
"Nonflavanoid phenols 0.245283\n",
"Proanthocyanins 0.457413\n",
"Color intensity 0.325939\n",
"Hue 0.455285\n",
"diluted wines 0.805861\n",
"Proline 0.457917\n",
"Name: 35, dtype: float64\n",
"Alcohol 0.365789\n",
"Malic acid 0.171937\n",
"Ash 0.443850\n",
"Alcalinity of ash 0.613402\n",
"Magnesium 0.413043\n",
"Total phenols 0.351724\n",
"Flavanoids 0.369198\n",
"Nonflavanoid phenols 0.396226\n",
"Proanthocyanins 0.378549\n",
"Color intensity 0.066553\n",
"Hue 0.471545\n",
"diluted wines 0.619048\n",
"Proline 0.047789\n",
"Name: 117, dtype: float64\n",
"Alcohol 0.602632\n",
"Malic acid 0.494071\n",
"Ash 0.545455\n",
"Alcalinity of ash 0.561856\n",
"Magnesium 0.239130\n",
"Total phenols 0.327586\n",
"Flavanoids 0.088608\n",
"Nonflavanoid phenols 0.603774\n",
"Proanthocyanins 0.264984\n",
"Color intensity 0.609215\n",
"Hue 0.056911\n",
"diluted wines 0.128205\n",
"Proline 0.265335\n",
"Name: 148, dtype: float64\n"
]
}
],
"source": [
"print('medoids:')\n",
"for point_idx in M:\n",
" print(data.iloc[point_idx] )"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...</td>\n",
" <td>[59, 60, 62, 64, 65, 67, 68, 69, 70, 72, 75, 7...</td>\n",
" <td>[61, 83, 118, 130, 131, 132, 133, 134, 135, 13...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 \\\n",
"0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
"\n",
" 1 \\\n",
"0 [59, 60, 62, 64, 65, 67, 68, 69, 70, 72, 75, 7... \n",
"\n",
" 2 \n",
"0 [61, 83, 118, 130, 131, 132, 133, 134, 135, 13... "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame([C], columns=C.keys())\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment