DanielaLaura/k-medoids in python.ipynb

## k-medoids in python.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "from sklearn.metrics.pairwise import pairwise_distances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# column labels handed to read_csv through names=; 'class' is the label column\n",
    "column_names = [\n",
    "    'class', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',\n",
    "    'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',\n",
    "    'Color intensity', 'Hue', 'diluted wines', 'Proline',\n",
    "]\n",
    "\n",
    "wine = pd.read_csv('wine.csv', names=column_names)\n",
    "\n",
    "# pop removes the label column from wine and returns it, so wine keeps\n",
    "# only the 13 feature columns\n",
    "wine_class = wine.pop('class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# fit the scaler on the feature columns, then rebuild a DataFrame that\n",
    "# carries the original column labels over the rescaled values\n",
    "scaler = MinMaxScaler().fit(wine)\n",
    "data = pd.DataFrame(scaler.transform(wine), columns=wine.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def euclideanDistance(x, y):\n",
    "    '''\n",
    "    Euclidean distance between x, y\n",
    "    --------\n",
    "    Parameters\n",
    "    x, y: sequences of numbers with the same shape (list, tuple,\n",
    "          numpy array or pandas Series)\n",
    "    --------\n",
    "    Return\n",
    "    d: float\n",
    "    --------\n",
    "    Raises\n",
    "    ValueError if x and y do not have the same shape\n",
    "    '''\n",
    "    # convert to arrays so that elements are paired by position; x[i] on a\n",
    "    # pandas Series would look up the label i instead\n",
    "    x = np.asarray(x, dtype=float)\n",
    "    y = np.asarray(y, dtype=float)\n",
    "    if x.shape != y.shape:\n",
    "        # a longer y used to be truncated silently, giving a wrong distance\n",
    "        raise ValueError('x and y must have the same shape')\n",
    "    # summing over axis 0 mirrors the original loop over range(len(x))\n",
    "    d = np.sqrt(np.sum((x - y)**2, axis=0))\n",
    "    return d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pairwise Euclidean distance matrix between the rows of the scaled data;\n",
    "# this is the matrix later passed to kMedoids\n",
    "D = pairwise_distances(data, metric='euclidean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(178, 178)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "D.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def kMedoids(D, k, tmax = 100):\n",
    "    '''\n",
    "    k-medoids clustering on a precomputed distance matrix\n",
    "    --------\n",
    "    Parameters\n",
    "    D: square 2-D array, D[i, j] is the distance between points i and j\n",
    "    k: number of clusters (medoids)\n",
    "    tmax: maximum number of assign/update iterations\n",
    "    --------\n",
    "    Return\n",
    "    M: array of the k medoid indices, in ascending order\n",
    "    C: dict, C[kappa] is the array of indices of the points assigned to M[kappa]\n",
    "    --------\n",
    "    Raises\n",
    "    ValueError if D is not square, if k < 1, or if k is larger than the\n",
    "    number of points left after removing duplicates (distance 0)\n",
    "    '''\n",
    "    D = np.asarray(D)\n",
    "    if D.ndim != 2 or D.shape[0] != D.shape[1]:\n",
    "        raise ValueError('D must be a square distance matrix')\n",
    "    n = D.shape[0]\n",
    "    if k < 1:\n",
    "        raise ValueError('k must be at least 1')\n",
    "    if k > n:\n",
    "        raise ValueError('too many medoids')\n",
    "\n",
    "    # find a set of valid initial cluster medoid indices since we\n",
    "    # can't seed different clusters with two points at the same location\n",
    "    invalid_medoid_inds = set()\n",
    "    rs, cs = np.where(D == 0)\n",
    "    # the rows, cols must be shuffled because we will keep the first duplicate below\n",
    "    index_shuf = list(range(len(rs)))\n",
    "    np.random.shuffle(index_shuf)\n",
    "    rs = rs[index_shuf]\n",
    "    cs = cs[index_shuf]\n",
    "    for r, c in zip(rs, cs):\n",
    "        # r < c skips the diagonal and visits each zero-distance pair once;\n",
    "        # the first point of the pair stays eligible, the second is dropped\n",
    "        if r < c and r not in invalid_medoid_inds:\n",
    "            invalid_medoid_inds.add(c)\n",
    "    valid_medoid_inds = list(set(range(n)) - invalid_medoid_inds)\n",
    "    if k > len(valid_medoid_inds):\n",
    "        raise ValueError('too many medoids (after removing {} duplicate points)'.format(\n",
    "            len(invalid_medoid_inds)))\n",
    "\n",
    "    # randomly initialize an array of k medoid indices, kept in ascending order\n",
    "    M = np.array(valid_medoid_inds)\n",
    "    np.random.shuffle(M)\n",
    "    M = np.sort(M[:k])\n",
    "\n",
    "    # working copy that receives the updated medoids of each iteration\n",
    "    Mnew = np.copy(M)\n",
    "\n",
    "    # dictionary that maps a cluster label to the indices of its members\n",
    "    C = {}\n",
    "    for _ in range(tmax):\n",
    "        # assignment step: every point joins the cluster of its nearest medoid\n",
    "        J = np.argmin(D[:,M], axis=1)\n",
    "        for kappa in range(k):\n",
    "            C[kappa] = np.where(J==kappa)[0]\n",
    "        # update step: the new medoid is the member with the smallest mean\n",
    "        # distance to the members of its cluster\n",
    "        for kappa in range(k):\n",
    "            if C[kappa].size == 0:\n",
    "                # an empty cluster has no candidate (argmin would raise),\n",
    "                # so its previous medoid is kept\n",
    "                continue\n",
    "            mean_dist = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)\n",
    "            Mnew[kappa] = C[kappa][np.argmin(mean_dist)]\n",
    "        # np.sort returns a sorted copy, so the array is sorted in place here\n",
    "        Mnew.sort()\n",
    "        # check for convergence\n",
    "        if np.array_equal(M, Mnew):\n",
    "            break\n",
    "        M = np.copy(Mnew)\n",
    "    else:\n",
    "        # no break happened: M was replaced after the last assignment step\n",
    "        # (or the loop never ran), so memberships are recomputed for the final M\n",
    "        J = np.argmin(D[:,M], axis=1)\n",
    "        for kappa in range(k):\n",
    "            C[kappa] = np.where(J==kappa)[0]\n",
    "\n",
    "    # return results\n",
    "    return M, C"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# kMedoids draws its initial medoids with np.random.shuffle, so the global\n",
    "# NumPy RNG is seeded here to make the clustering repeatable on re-run\n",
    "np.random.seed(0)\n",
    "M, C = kMedoids(D, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 35, 117, 148])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,\n",
       "         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,\n",
       "         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,\n",
       "         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,\n",
       "         52,  53,  54,  55,  56,  57,  58,  63,  66,  71,  73,  74,  81,\n",
       "         98, 109, 121, 124], dtype=int64),\n",
       " 1: array([ 59,  60,  62,  64,  65,  67,  68,  69,  70,  72,  75,  76,  77,\n",
       "         78,  79,  80,  82,  84,  85,  86,  87,  88,  89,  90,  91,  92,\n",
       "         93,  94,  95,  96,  97,  99, 100, 101, 102, 103, 104, 105, 106,\n",
       "        107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 122,\n",
       "        123, 125, 126, 127, 128, 129], dtype=int64),\n",
       " 2: array([ 61,  83, 118, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,\n",
       "        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,\n",
       "        153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,\n",
       "        166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177],\n",
       "       dtype=int64)}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "C"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "medoids:\n",
      "Alcohol                 0.644737\n",
      "Malic acid              0.211462\n",
      "Ash                     0.561497\n",
      "Alcalinity of ash       0.510309\n",
      "Magnesium               0.326087\n",
      "Total phenols           0.593103\n",
      "Flavanoids              0.556962\n",
      "Nonflavanoid phenols    0.245283\n",
      "Proanthocyanins         0.457413\n",
      "Color intensity         0.325939\n",
      "Hue                     0.455285\n",
      "diluted wines           0.805861\n",
      "Proline                 0.457917\n",
      "Name: 35, dtype: float64\n",
      "Alcohol                 0.365789\n",
      "Malic acid              0.171937\n",
      "Ash                     0.443850\n",
      "Alcalinity of ash       0.613402\n",
      "Magnesium               0.413043\n",
      "Total phenols           0.351724\n",
      "Flavanoids              0.369198\n",
      "Nonflavanoid phenols    0.396226\n",
      "Proanthocyanins         0.378549\n",
      "Color intensity         0.066553\n",
      "Hue                     0.471545\n",
      "diluted wines           0.619048\n",
      "Proline                 0.047789\n",
      "Name: 117, dtype: float64\n",
      "Alcohol                 0.602632\n",
      "Malic acid              0.494071\n",
      "Ash                     0.545455\n",
      "Alcalinity of ash       0.561856\n",
      "Magnesium               0.239130\n",
      "Total phenols           0.327586\n",
      "Flavanoids              0.088608\n",
      "Nonflavanoid phenols    0.603774\n",
      "Proanthocyanins         0.264984\n",
      "Color intensity         0.609215\n",
      "Hue                     0.056911\n",
      "diluted wines           0.128205\n",
      "Proline                 0.265335\n",
      "Name: 148, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# print the row of data (the min-max scaled features) for each medoid index in M\n",
    "print('medoids:')\n",
    "for point_idx in M:\n",
    "    print(data.iloc[point_idx] )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...</td>\n",
       "      <td>[59, 60, 62, 64, 65, 67, 68, 69, 70, 72, 75, 7...</td>\n",
       "      <td>[61, 83, 118, 130, 131, 132, 133, 134, 135, 13...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0  \\\n",
       "0  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...   \n",
       "\n",
       "                                                   1  \\\n",
       "0  [59, 60, 62, 64, 65, 67, 68, 69, 70, 72, 75, 7...   \n",
       "\n",
       "                                                   2  \n",
       "0  [61, 83, 118, 130, 131, 132, 133, 134, 135, 13...  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# wrap the cluster dict in a one-row DataFrame: one column per cluster label,\n",
    "# each cell holding the index array of that cluster\n",
    "df = pd.DataFrame([C], columns=C.keys())\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import random\n",
	"from sklearn.metrics.pairwise import pairwise_distances"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"# column labels handed to read_csv through names=; 'class' is the label column\n",
	"column_names = [\n",
	"    'class', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',\n",
	"    'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',\n",
	"    'Color intensity', 'Hue', 'diluted wines', 'Proline',\n",
	"]\n",
	"\n",
	"wine = pd.read_csv('wine.csv', names=column_names)\n",
	"\n",
	"# pop removes the label column from wine and returns it, so wine keeps\n",
	"# only the 13 feature columns\n",
	"wine_class = wine.pop('class')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.preprocessing import MinMaxScaler\n",
	"\n",
	"# fit the scaler on the feature columns, then rebuild a DataFrame that\n",
	"# carries the original column labels over the rescaled values\n",
	"scaler = MinMaxScaler().fit(wine)\n",
	"data = pd.DataFrame(scaler.transform(wine), columns=wine.columns)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"def euclideanDistance(x, y):\n",
	"    '''\n",
	"    Euclidean distance between x, y\n",
	"    --------\n",
	"    Parameters\n",
	"    x, y: sequences of numbers with the same shape (list, tuple,\n",
	"          numpy array or pandas Series)\n",
	"    --------\n",
	"    Return\n",
	"    d: float\n",
	"    --------\n",
	"    Raises\n",
	"    ValueError if x and y do not have the same shape\n",
	"    '''\n",
	"    # convert to arrays so that elements are paired by position; x[i] on a\n",
	"    # pandas Series would look up the label i instead\n",
	"    x = np.asarray(x, dtype=float)\n",
	"    y = np.asarray(y, dtype=float)\n",
	"    if x.shape != y.shape:\n",
	"        # a longer y used to be truncated silently, giving a wrong distance\n",
	"        raise ValueError('x and y must have the same shape')\n",
	"    # summing over axis 0 mirrors the original loop over range(len(x))\n",
	"    d = np.sqrt(np.sum((x - y)**2, axis=0))\n",
	"    return d"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# pairwise Euclidean distance matrix between the rows of the scaled data;\n",
	"# this is the matrix later passed to kMedoids\n",
	"D = pairwise_distances(data, metric='euclidean')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(178, 178)"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"D.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"def kMedoids(D, k, tmax = 100):\n",
	"    '''\n",
	"    k-medoids clustering on a precomputed distance matrix\n",
	"    --------\n",
	"    Parameters\n",
	"    D: square 2-D array, D[i, j] is the distance between points i and j\n",
	"    k: number of clusters (medoids)\n",
	"    tmax: maximum number of assign/update iterations\n",
	"    --------\n",
	"    Return\n",
	"    M: array of the k medoid indices, in ascending order\n",
	"    C: dict, C[kappa] is the array of indices of the points assigned to M[kappa]\n",
	"    --------\n",
	"    Raises\n",
	"    ValueError if D is not square, if k < 1, or if k is larger than the\n",
	"    number of points left after removing duplicates (distance 0)\n",
	"    '''\n",
	"    D = np.asarray(D)\n",
	"    if D.ndim != 2 or D.shape[0] != D.shape[1]:\n",
	"        raise ValueError('D must be a square distance matrix')\n",
	"    n = D.shape[0]\n",
	"    if k < 1:\n",
	"        raise ValueError('k must be at least 1')\n",
	"    if k > n:\n",
	"        raise ValueError('too many medoids')\n",
	"\n",
	"    # find a set of valid initial cluster medoid indices since we\n",
	"    # can't seed different clusters with two points at the same location\n",
	"    invalid_medoid_inds = set()\n",
	"    rs, cs = np.where(D == 0)\n",
	"    # the rows, cols must be shuffled because we will keep the first duplicate below\n",
	"    index_shuf = list(range(len(rs)))\n",
	"    np.random.shuffle(index_shuf)\n",
	"    rs = rs[index_shuf]\n",
	"    cs = cs[index_shuf]\n",
	"    for r, c in zip(rs, cs):\n",
	"        # r < c skips the diagonal and visits each zero-distance pair once;\n",
	"        # the first point of the pair stays eligible, the second is dropped\n",
	"        if r < c and r not in invalid_medoid_inds:\n",
	"            invalid_medoid_inds.add(c)\n",
	"    valid_medoid_inds = list(set(range(n)) - invalid_medoid_inds)\n",
	"    if k > len(valid_medoid_inds):\n",
	"        raise ValueError('too many medoids (after removing {} duplicate points)'.format(\n",
	"            len(invalid_medoid_inds)))\n",
	"\n",
	"    # randomly initialize an array of k medoid indices, kept in ascending order\n",
	"    M = np.array(valid_medoid_inds)\n",
	"    np.random.shuffle(M)\n",
	"    M = np.sort(M[:k])\n",
	"\n",
	"    # working copy that receives the updated medoids of each iteration\n",
	"    Mnew = np.copy(M)\n",
	"\n",
	"    # dictionary that maps a cluster label to the indices of its members\n",
	"    C = {}\n",
	"    for _ in range(tmax):\n",
	"        # assignment step: every point joins the cluster of its nearest medoid\n",
	"        J = np.argmin(D[:,M], axis=1)\n",
	"        for kappa in range(k):\n",
	"            C[kappa] = np.where(J==kappa)[0]\n",
	"        # update step: the new medoid is the member with the smallest mean\n",
	"        # distance to the members of its cluster\n",
	"        for kappa in range(k):\n",
	"            if C[kappa].size == 0:\n",
	"                # an empty cluster has no candidate (argmin would raise),\n",
	"                # so its previous medoid is kept\n",
	"                continue\n",
	"            mean_dist = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)\n",
	"            Mnew[kappa] = C[kappa][np.argmin(mean_dist)]\n",
	"        # np.sort returns a sorted copy, so the array is sorted in place here\n",
	"        Mnew.sort()\n",
	"        # check for convergence\n",
	"        if np.array_equal(M, Mnew):\n",
	"            break\n",
	"        M = np.copy(Mnew)\n",
	"    else:\n",
	"        # no break happened: M was replaced after the last assignment step\n",
	"        # (or the loop never ran), so memberships are recomputed for the final M\n",
	"        J = np.argmin(D[:,M], axis=1)\n",
	"        for kappa in range(k):\n",
	"            C[kappa] = np.where(J==kappa)[0]\n",
	"\n",
	"    # return results\n",
	"    return M, C"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"# kMedoids draws its initial medoids with np.random.shuffle, so the global\n",
	"# NumPy RNG is seeded here to make the clustering repeatable on re-run\n",
	"np.random.seed(0)\n",
	"M, C = kMedoids(D, 3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([ 35, 117, 148])"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"M"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{0: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n",
	" 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,\n",
	" 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,\n",
	" 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,\n",
	" 52, 53, 54, 55, 56, 57, 58, 63, 66, 71, 73, 74, 81,\n",
	" 98, 109, 121, 124], dtype=int64),\n",
	" 1: array([ 59, 60, 62, 64, 65, 67, 68, 69, 70, 72, 75, 76, 77,\n",
	" 78, 79, 80, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92,\n",
	" 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106,\n",
	" 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 122,\n",
	" 123, 125, 126, 127, 128, 129], dtype=int64),\n",
	" 2: array([ 61, 83, 118, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,\n",
	" 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,\n",
	" 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,\n",
	" 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177],\n",
	" dtype=int64)}"
	]
	},
	"execution_count": 39,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"C"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"medoids:\n",
	"Alcohol 0.644737\n",
	"Malic acid 0.211462\n",
	"Ash 0.561497\n",
	"Alcalinity of ash 0.510309\n",
	"Magnesium 0.326087\n",
	"Total phenols 0.593103\n",
	"Flavanoids 0.556962\n",
	"Nonflavanoid phenols 0.245283\n",
	"Proanthocyanins 0.457413\n",
	"Color intensity 0.325939\n",
	"Hue 0.455285\n",
	"diluted wines 0.805861\n",
	"Proline 0.457917\n",
	"Name: 35, dtype: float64\n",
	"Alcohol 0.365789\n",
	"Malic acid 0.171937\n",
	"Ash 0.443850\n",
	"Alcalinity of ash 0.613402\n",
	"Magnesium 0.413043\n",
	"Total phenols 0.351724\n",
	"Flavanoids 0.369198\n",
	"Nonflavanoid phenols 0.396226\n",
	"Proanthocyanins 0.378549\n",
	"Color intensity 0.066553\n",
	"Hue 0.471545\n",
	"diluted wines 0.619048\n",
	"Proline 0.047789\n",
	"Name: 117, dtype: float64\n",
	"Alcohol 0.602632\n",
	"Malic acid 0.494071\n",
	"Ash 0.545455\n",
	"Alcalinity of ash 0.561856\n",
	"Magnesium 0.239130\n",
	"Total phenols 0.327586\n",
	"Flavanoids 0.088608\n",
	"Nonflavanoid phenols 0.603774\n",
	"Proanthocyanins 0.264984\n",
	"Color intensity 0.609215\n",
	"Hue 0.056911\n",
	"diluted wines 0.128205\n",
	"Proline 0.265335\n",
	"Name: 148, dtype: float64\n"
	]
	}
	],
	"source": [
	"# print the row of data (the min-max scaled features) for each medoid index in M\n",
	"print('medoids:')\n",
	"for point_idx in M:\n",
	" print(data.iloc[point_idx] )"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...</td>\n",
	" <td>[59, 60, 62, 64, 65, 67, 68, 69, 70, 72, 75, 7...</td>\n",
	" <td>[61, 83, 118, 130, 131, 132, 133, 134, 135, 13...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" 0 \\\n",
	"0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
	"\n",
	" 1 \\\n",
	"0 [59, 60, 62, 64, 65, 67, 68, 69, 70, 72, 75, 7... \n",
	"\n",
	" 2 \n",
	"0 [61, 83, 118, 130, 131, 132, 133, 134, 135, 13... "
	]
	},
	"execution_count": 38,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# wrap the cluster dict in a one-row DataFrame: one column per cluster label,\n",
	"# each cell holding the index array of that cluster\n",
	"df = pd.DataFrame([C], columns=C.keys())\n",
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}