Vini2/Clustering using convex hulls - High Dim.ipynb

## Clustering using convex hulls - High Dim.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clustering using convex hulls - high dimensional data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.datasets import make_blobs\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.cluster import KMeans\n",
    "from itertools import permutations\n",
    "from scipy.spatial import ConvexHull\n",
    "from quadprog import solve_qp\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating dataset \n",
    "centers = [[0.8, 0.5, 0.7, 0.6, 0.5, 0.5, 0.3, 0.3], \n",
    "           [0.2, 0.2, 0.2, 0.1, 0.1, 0.2, 0.3, 0.2], \n",
    "           [0.1, -0.1, -0.1, -0.2, -0.2, -0.1, 0, 0.1]]\n",
    "stds = [0.4, 0.3, 0.4]   \n",
    "\n",
    "X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=stds, random_state=0)        \n",
    "point_indices = np.arange(1000)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get training and testing datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_seeds, X_rest, y_seeds, y_rest, id_seeds, id_rest = train_test_split(X, labels_true, point_indices, test_size=0.33, random_state=42)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Remap clustering result to original labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remap_labels(pred_labels, true_labels):\n",
    "    \"\"\"Rename prediction labels (clustered output) to best match true labels.\"\"\"\n",
    "    # from itertools import permutations # import this into script.\n",
    "    pred_labels, true_labels = np.array(pred_labels), np.array(true_labels)\n",
    "    assert pred_labels.ndim == 1 == true_labels.ndim\n",
    "    assert len(pred_labels) == len(true_labels)\n",
    "    cluster_names = np.unique(pred_labels)\n",
    "    accuracy = 0\n",
    "\n",
    "    perms = np.array(list(permutations(np.unique(true_labels))))\n",
    "\n",
    "    remapped_labels = true_labels\n",
    "    for perm in perms:\n",
    "        flipped_labels = np.zeros(len(true_labels))\n",
    "        for label_index, label in enumerate(cluster_names):\n",
    "            flipped_labels[pred_labels == label] = perm[label_index]\n",
    "\n",
    "        testAcc = np.sum(flipped_labels == true_labels) / len(true_labels)\n",
    "        if testAcc > accuracy:\n",
    "            accuracy = testAcc\n",
    "            remapped_labels = flipped_labels\n",
    "\n",
    "    return accuracy, remapped_labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get initial clustering using K-means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8641791044776119\n"
     ]
    }
   ],
   "source": [
    "kmeans = KMeans(n_clusters=3, random_state=9).fit(X_seeds)\n",
    "initial_result = kmeans.labels_\n",
    "\n",
    "intial_accuracy, remapped_initial_result = remap_labels(initial_result, y_seeds)\n",
    "print(intial_accuracy)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get convex hulls of the initial clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys([2, 1, 0])\n"
     ]
    }
   ],
   "source": [
    "# Get the idices of the data points belonging to each cluster\n",
    "indices = {}\n",
    "\n",
    "for i in range(len(id_seeds)):\n",
    "    if int(remapped_initial_result[i]) not in indices:\n",
    "        indices[int(remapped_initial_result[i])] = [i]\n",
    "    else:\n",
    "        indices[int(remapped_initial_result[i])].append(i)\n",
    "\n",
    "# Get convex hulls for each cluster\n",
    "hulls = {}\n",
    "\n",
    "for i in indices:\n",
    "    hull = ConvexHull(X_seeds[indices[i]])\n",
    "    hulls[i] = hull\n",
    "\n",
    "print(hulls.keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Assign remaining points to the cluster of the closest convex hull"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def point_in_hull(point, hull, tolerance=1e-12):\n",
    "    return all(\n",
    "        (np.dot(eq[:-1], point) + eq[-1] <= tolerance) for eq in hull.equations)\n",
    "\n",
    "def proj2hull(z, equations):\n",
    "    \"\"\"\n",
    "    Project `z` to the convex hull defined by the\n",
    "    hyperplane equations of the facets\n",
    "\n",
    "    Arguments\n",
    "        z: array, shape (ndim,)\n",
    "        equations: array shape (nfacets, ndim + 1)\n",
    "\n",
    "    Returns\n",
    "        x: array, shape (ndim,)\n",
    "    \"\"\"\n",
    "    G = np.eye(len(z), dtype=float)\n",
    "    a = np.array(z, dtype=float)\n",
    "    C = np.array(-equations[:, :-1], dtype=float)\n",
    "    b = np.array(equations[:, -1], dtype=float)\n",
    "    x, f, xu, itr, lag, act = solve_qp(G, a, C.T, b, meq=0, factorized=True)\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "prediction = []\n",
    "\n",
    "for z1 in X_rest:\n",
    "    \n",
    "    min_cluster_distance = 100000\n",
    "    min_distance_point = \"\"\n",
    "    min_cluster_distance_hull = \"\"\n",
    "    \n",
    "    for i in indices:\n",
    "\n",
    "        p = proj2hull(z1, hulls[i].equations)\n",
    "\n",
    "        dist = np.linalg.norm(z1-p)\n",
    "\n",
    "        if dist < min_cluster_distance:\n",
    "            min_cluster_distance = dist\n",
    "            min_distance_point = p\n",
    "            min_cluster_distance_hull = i\n",
    "            \n",
    "    prediction.append(min_cluster_distance_hull)\n",
    "\n",
    "prediction = np.array(prediction)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate final result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of Convex Hull method: 0.867\n"
     ]
    }
   ],
   "source": [
    "Y_pred = np.concatenate((remapped_initial_result, prediction))\n",
    "Y_real = np.concatenate((y_seeds, y_rest))\n",
    "\n",
    "final_accuracy, remapped_final_result = remap_labels(Y_pred, Y_real)\n",
    "\n",
    "print(\"Accuracy of Convex Hull method:\", final_accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.867"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy_score(Y_real, Y_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run K-means on the entire dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of K-means method: 0.866\n"
     ]
    }
   ],
   "source": [
    "kmeans = KMeans(n_clusters=3, random_state=9).fit(X)\n",
    "result = kmeans.labels_\n",
    "\n",
    "accuracy, remapped_result = remap_labels(result, labels_true)\n",
    "\n",
    "print(\"Accuracy of K-means method:\", accuracy)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Clustering using convex hulls - high dimensional data"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Imports"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import matplotlib.pyplot as plt\n",
	"import numpy as np\n",
	"import seaborn as sns\n",
	"\n",
	"from sklearn.datasets import make_blobs\n",
	"from mpl_toolkits.mplot3d import Axes3D\n",
	"from sklearn.model_selection import train_test_split\n",
	"from sklearn.cluster import KMeans\n",
	"from itertools import permutations\n",
	"from scipy.spatial import ConvexHull\n",
	"from quadprog import solve_qp\n",
	"from sklearn.metrics import accuracy_score"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Create dataset"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Creating dataset \n",
	"centers = [[0.8, 0.5, 0.7, 0.6, 0.5, 0.5, 0.3, 0.3], \n",
	" [0.2, 0.2, 0.2, 0.1, 0.1, 0.2, 0.3, 0.2], \n",
	" [0.1, -0.1, -0.1, -0.2, -0.2, -0.1, 0, 0.1]]\n",
	"stds = [0.4, 0.3, 0.4] \n",
	"\n",
	"X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=stds, random_state=0) \n",
	"point_indices = np.arange(1000)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Get training and testing datasets"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"X_seeds, X_rest, y_seeds, y_rest, id_seeds, id_rest = train_test_split(X, labels_true, point_indices, test_size=0.33, random_state=42)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Remap clustering result to original labels"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"def remap_labels(pred_labels, true_labels):\n",
	" \"\"\"Rename prediction labels (clustered output) to best match true labels.\"\"\"\n",
	" # from itertools import permutations # import this into script.\n",
	" pred_labels, true_labels = np.array(pred_labels), np.array(true_labels)\n",
	" assert pred_labels.ndim == 1 == true_labels.ndim\n",
	" assert len(pred_labels) == len(true_labels)\n",
	" cluster_names = np.unique(pred_labels)\n",
	" accuracy = 0\n",
	"\n",
	" perms = np.array(list(permutations(np.unique(true_labels))))\n",
	"\n",
	" remapped_labels = true_labels\n",
	" for perm in perms:\n",
	" flipped_labels = np.zeros(len(true_labels))\n",
	" for label_index, label in enumerate(cluster_names):\n",
	" flipped_labels[pred_labels == label] = perm[label_index]\n",
	"\n",
	" testAcc = np.sum(flipped_labels == true_labels) / len(true_labels)\n",
	" if testAcc > accuracy:\n",
	" accuracy = testAcc\n",
	" remapped_labels = flipped_labels\n",
	"\n",
	" return accuracy, remapped_labels"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Get initial clustering using K-means"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.8641791044776119\n"
	]
	}
	],
	"source": [
	"kmeans = KMeans(n_clusters=3, random_state=9).fit(X_seeds)\n",
	"initial_result = kmeans.labels_\n",
	"\n",
	"intial_accuracy, remapped_initial_result = remap_labels(initial_result, y_seeds)\n",
	"print(intial_accuracy)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Get convex hulls of the initial clustering"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"dict_keys([2, 1, 0])\n"
	]
	}
	],
	"source": [
	"# Get the idices of the data points belonging to each cluster\n",
	"indices = {}\n",
	"\n",
	"for i in range(len(id_seeds)):\n",
	" if int(remapped_initial_result[i]) not in indices:\n",
	" indices[int(remapped_initial_result[i])] = [i]\n",
	" else:\n",
	" indices[int(remapped_initial_result[i])].append(i)\n",
	"\n",
	"# Get convex hulls for each cluster\n",
	"hulls = {}\n",
	"\n",
	"for i in indices:\n",
	" hull = ConvexHull(X_seeds[indices[i]])\n",
	" hulls[i] = hull\n",
	"\n",
	"print(hulls.keys())"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Assign remaining points to the cluster of the closest convex hull"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"def point_in_hull(point, hull, tolerance=1e-12):\n",
	" return all(\n",
	" (np.dot(eq[:-1], point) + eq[-1] <= tolerance) for eq in hull.equations)\n",
	"\n",
	"def proj2hull(z, equations):\n",
	" \"\"\"\n",
	" Project `z` to the convex hull defined by the\n",
	" hyperplane equations of the facets\n",
	"\n",
	" Arguments\n",
	" z: array, shape (ndim,)\n",
	" equations: array shape (nfacets, ndim + 1)\n",
	"\n",
	" Returns\n",
	" x: array, shape (ndim,)\n",
	" \"\"\"\n",
	" G = np.eye(len(z), dtype=float)\n",
	" a = np.array(z, dtype=float)\n",
	" C = np.array(-equations[:, :-1], dtype=float)\n",
	" b = np.array(equations[:, -1], dtype=float)\n",
	" x, f, xu, itr, lag, act = solve_qp(G, a, C.T, b, meq=0, factorized=True)\n",
	" return x"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"prediction = []\n",
	"\n",
	"for z1 in X_rest:\n",
	" \n",
	" min_cluster_distance = 100000\n",
	" min_distance_point = \"\"\n",
	" min_cluster_distance_hull = \"\"\n",
	" \n",
	" for i in indices:\n",
	"\n",
	" p = proj2hull(z1, hulls[i].equations)\n",
	"\n",
	" dist = np.linalg.norm(z1-p)\n",
	"\n",
	" if dist < min_cluster_distance:\n",
	" min_cluster_distance = dist\n",
	" min_distance_point = p\n",
	" min_cluster_distance_hull = i\n",
	" \n",
	" prediction.append(min_cluster_distance_hull)\n",
	"\n",
	"prediction = np.array(prediction)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Evaluate final result"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Accuracy of Convex Hull method: 0.867\n"
	]
	}
	],
	"source": [
	"Y_pred = np.concatenate((remapped_initial_result, prediction))\n",
	"Y_real = np.concatenate((y_seeds, y_rest))\n",
	"\n",
	"final_accuracy, remapped_final_result = remap_labels(Y_pred, Y_real)\n",
	"\n",
	"print(\"Accuracy of Convex Hull method:\", final_accuracy)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.867"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"accuracy_score(Y_real, Y_pred)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Run K-means on the entire dataset"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Accuracy of K-means method: 0.866\n"
	]
	}
	],
	"source": [
	"kmeans = KMeans(n_clusters=3, random_state=9).fit(X)\n",
	"result = kmeans.labels_\n",
	"\n",
	"accuracy, remapped_result = remap_labels(result, labels_true)\n",
	"\n",
	"print(\"Accuracy of K-means method:\", accuracy)\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}