Skip to content

Instantly share code, notes, and snippets.

@DanielaLaura
Created January 7, 2021 21:38
Show Gist options
  • Save DanielaLaura/c3a30f0e1cdc3f5252acd7898c206a66 to your computer and use it in GitHub Desktop.
Save DanielaLaura/c3a30f0e1cdc3f5252acd7898c206a66 to your computer and use it in GitHub Desktop.
k-medoids steps
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"column_names=['class','Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium',\n",
" 'Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins',\n",
" 'Color intensity','Hue','diluted wines','Proline']\n",
"\n",
"wine = pd.read_csv('wine.csv',names=column_names)\n",
" \n",
"wine_class = wine['class']\n",
"del wine['class']"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"scaler = MinMaxScaler()\n",
"data = pd.DataFrame(scaler.fit_transform(wine), columns=wine.columns)"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"m = data.shape[0]\n",
"n = data.shape[1]\n",
"n_iter = 50"
]
},
{
"cell_type": "code",
"execution_count": 258,
"metadata": {},
"outputs": [],
"source": [
"K= 3\n",
"import random\n",
"#create an empty centroid array\n",
"centroids = np.array([]).reshape(n,0)\n",
"#create K random centroids, one column per randomly chosen data point\n",
"for k in range(K):\n",
" centroids = np.c_[centroids, data.iloc[random.randint(0,m-1)]]"
]
},
{
"cell_type": "code",
"execution_count": 259,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.61315789, 0.43684211, 0.75526316],\n",
" [0.35968379, 0.15612648, 0.18577075],\n",
" [0.52941176, 0.48128342, 0.40641711],\n",
" [0.48453608, 0.52061856, 0.27835052],\n",
" [0.20652174, 0.10869565, 0.33695652],\n",
" [0.14482759, 0.13793103, 0.73103448],\n",
" [0.03375527, 0.23628692, 0.64345992],\n",
" [0.45283019, 0.8490566 , 0.1509434 ],\n",
" [0.07255521, 0.38170347, 0.54574132],\n",
" [0.36860068, 0.15102389, 0.4112628 ],\n",
" [0.17886179, 0.3902439 , 0.3495935 ],\n",
" [0.43956044, 0.28937729, 0.75457875],\n",
" [0.35805991, 0.15477889, 0.5042796 ]])"
]
},
"execution_count": 259,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"centroids"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 260,
"metadata": {},
"outputs": [],
"source": [
"def initMedoids( X,k):\n",
" ''' \n",
" Parameters\n",
" ----------\n",
" X: input data.\n",
" k: number of medoids to draw.\n",
" '''\n",
" medoids = []\n",
" \n",
" #Starting medoids will be random members from data set X\n",
" indexes = np.random.randint(0, len(X)-1,k)\n",
" medoids = X.iloc[indexes]\n",
" return medoids"
]
},
{
"cell_type": "code",
"execution_count": 261,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Alcohol</th>\n",
" <th>Malic acid</th>\n",
" <th>Ash</th>\n",
" <th>Alcalinity of ash</th>\n",
" <th>Magnesium</th>\n",
" <th>Total phenols</th>\n",
" <th>Flavanoids</th>\n",
" <th>Nonflavanoid phenols</th>\n",
" <th>Proanthocyanins</th>\n",
" <th>Color intensity</th>\n",
" <th>Hue</th>\n",
" <th>diluted wines</th>\n",
" <th>Proline</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>0.671053</td>\n",
" <td>0.181818</td>\n",
" <td>0.534759</td>\n",
" <td>0.438144</td>\n",
" <td>0.391304</td>\n",
" <td>0.648276</td>\n",
" <td>0.601266</td>\n",
" <td>0.169811</td>\n",
" <td>0.485804</td>\n",
" <td>0.479522</td>\n",
" <td>0.495935</td>\n",
" <td>0.589744</td>\n",
" <td>0.882311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>0.321053</td>\n",
" <td>0.195652</td>\n",
" <td>0.406417</td>\n",
" <td>0.432990</td>\n",
" <td>0.108696</td>\n",
" <td>0.231034</td>\n",
" <td>0.356540</td>\n",
" <td>0.452830</td>\n",
" <td>0.384858</td>\n",
" <td>0.180887</td>\n",
" <td>0.422764</td>\n",
" <td>0.695971</td>\n",
" <td>0.165478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>0.755263</td>\n",
" <td>0.185771</td>\n",
" <td>0.406417</td>\n",
" <td>0.278351</td>\n",
" <td>0.336957</td>\n",
" <td>0.731034</td>\n",
" <td>0.643460</td>\n",
" <td>0.150943</td>\n",
" <td>0.545741</td>\n",
" <td>0.411263</td>\n",
" <td>0.349593</td>\n",
" <td>0.754579</td>\n",
" <td>0.504280</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Alcohol Malic acid Ash Alcalinity of ash Magnesium \\\n",
"31 0.671053 0.181818 0.534759 0.438144 0.391304 \n",
"106 0.321053 0.195652 0.406417 0.432990 0.108696 \n",
"47 0.755263 0.185771 0.406417 0.278351 0.336957 \n",
"\n",
" Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins \\\n",
"31 0.648276 0.601266 0.169811 0.485804 \n",
"106 0.231034 0.356540 0.452830 0.384858 \n",
"47 0.731034 0.643460 0.150943 0.545741 \n",
"\n",
" Color intensity Hue diluted wines Proline \n",
"31 0.479522 0.495935 0.589744 0.882311 \n",
"106 0.180887 0.422764 0.695971 0.165478 \n",
"47 0.411263 0.349593 0.754579 0.504280 "
]
},
"execution_count": 261,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pr = initMedoids( data,3)\n",
"pr"
]
},
{
"cell_type": "code",
"execution_count": 277,
"metadata": {},
"outputs": [],
"source": [
"#create an empty array\n",
"euclid = np.array([]).reshape(m,0)\n",
"#find the squared Euclidean distance between each centroid and each point\n",
"for k in range(K):\n",
" dist=np.sum((data-centroids[:,k])**2,axis=1)\n",
" euclid=np.c_[euclid,dist]\n",
"#label each point with the 1-based index of its nearest centroid\n",
"labels=np.argmin(euclid,axis=1)+1"
]
},
{
"cell_type": "code",
"execution_count": 263,
"metadata": {},
"outputs": [],
"source": [
"#euclid"
]
},
{
"cell_type": "code",
"execution_count": 264,
"metadata": {},
"outputs": [],
"source": [
"def compute_distance (X, medoids, k):\n",
"#create an empty array\n",
" euclid = np.array([]).reshape(m,0)\n",
"#find the squared Euclidean distance between each medoid and each point\n",
" for k in range(K):\n",
" dist=np.sum((X-medoids[:,k])**2,axis=1)\n",
" euclid=np.c_[euclid,dist]\n",
" #label each point with the 1-based index of its nearest medoid\n",
" labels=np.argmin(euclid,axis=1)+1\n",
" return labels"
]
},
{
"cell_type": "code",
"execution_count": 265,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
" 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
" 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 2, 1, 3, 2, 2,\n",
" 3, 1, 2, 3, 2, 3, 1, 3, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2,\n",
" 2, 2, 2, 2, 2, 3, 3, 3, 1, 3, 3, 2, 3, 2, 1, 2, 3, 2, 2, 2, 2, 3,\n",
" 3, 1, 2, 2, 2, 2, 2, 2, 1, 2, 3, 3, 2, 3, 3, 2, 2, 2, 2, 1, 1, 1,\n",
" 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1], dtype=int64)"
]
},
"execution_count": 265,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels = compute_distance (data,centroids, 3)\n",
"labels"
]
},
{
"cell_type": "code",
"execution_count": 266,
"metadata": {},
"outputs": [],
"source": [
" def euclidian(param1, param2):\n",
" euclidian_distance = np.sqrt(np.sum((param1-param2)**2))\n",
" #euclidian_distance = np.sum(np.abs(param1-param2))\n",
" return euclidian_distance"
]
},
{
"cell_type": "code",
"execution_count": 267,
"metadata": {},
"outputs": [],
"source": [
"def updateMedoids( X, labels):\n",
" '''\n",
" Parameters\n",
" ----------\n",
" labels: a list containing the cluster label of each data point\n",
" '''\n",
" #self.has_converged = True\n",
" \n",
" #Group the data points by the cluster they belong to\n",
" k=3\n",
" medoids_cost = np.sum(euclid)\n",
" \n",
" clusters = []\n",
" for i in range(0,k):\n",
" cluster = []\n",
" for j in range(len(X)):\n",
" if (labels[j] == i):\n",
" cluster.append(X.loc[j])\n",
" clusters.append(cluster)\n",
" \n",
" #Calculate the new medoids\n",
" new_medoids = []\n",
" for i in range(0, k):\n",
" new_medoid = centroids[i]\n",
" old_medoids_cost = medoids_cost\n",
" for j in range(len(clusters[i])):\n",
" \n",
" #Total distance from the candidate point to every point in its cluster, compared with the best cost so far\n",
" cur_medoids_cost = 0\n",
" for dpoint_index in range(len(clusters[i])):\n",
" cur_medoids_cost += np.sum(euclidian(clusters[i][j], clusters[i][dpoint_index]))\n",
" \n",
" #If current cost is less than current optimal cost,\n",
" #make the current data point new medoid of the cluster\n",
" if cur_medoids_cost < old_medoids_cost:\n",
" new_medoid = clusters[i][j]\n",
" old_medoids_cost = cur_medoids_cost\n",
" \n",
" #Now we have the optimal medoid of the current cluster\n",
" new_medoids.append(new_medoid)\n",
" return new_medoids"
]
},
{
"cell_type": "code",
"execution_count": 268,
"metadata": {},
"outputs": [],
"source": [
"update = updateMedoids( data, labels)"
]
},
{
"cell_type": "code",
"execution_count": 233,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment