Skip to content

Instantly share code, notes, and snippets.

@Vini2
Last active October 20, 2023 20:06
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Vini2/0dc28a1d8b7d78cf30b3d633cd62c271 to your computer and use it in GitHub Desktop.
Save Vini2/0dc28a1d8b7d78cf30b3d633cd62c271 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Clustering using convex hulls - high dimensional data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
"\n",
"from sklearn.datasets import make_blobs\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.cluster import KMeans\n",
"from itertools import permutations\n",
"from scipy.spatial import ConvexHull\n",
"from quadprog import solve_qp\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Creating dataset \n",
"centers = [[0.8, 0.5, 0.7, 0.6, 0.5, 0.5, 0.3, 0.3], \n",
" [0.2, 0.2, 0.2, 0.1, 0.1, 0.2, 0.3, 0.2], \n",
" [0.1, -0.1, -0.1, -0.2, -0.2, -0.1, 0, 0.1]]\n",
"stds = [0.4, 0.3, 0.4] \n",
"\n",
"X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=stds, random_state=0) \n",
"point_indices = np.arange(1000)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get training and testing datasets"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"X_seeds, X_rest, y_seeds, y_rest, id_seeds, id_rest = train_test_split(X, labels_true, point_indices, test_size=0.33, random_state=42)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Remap clustering result to original labels"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def remap_labels(pred_labels, true_labels):\n",
" \"\"\"Rename prediction labels (clustered output) to best match true labels.\"\"\"\n",
" # from itertools import permutations # import this into script.\n",
" pred_labels, true_labels = np.array(pred_labels), np.array(true_labels)\n",
" assert pred_labels.ndim == 1 == true_labels.ndim\n",
" assert len(pred_labels) == len(true_labels)\n",
" cluster_names = np.unique(pred_labels)\n",
" accuracy = 0\n",
"\n",
" perms = np.array(list(permutations(np.unique(true_labels))))\n",
"\n",
" remapped_labels = true_labels\n",
" for perm in perms:\n",
" flipped_labels = np.zeros(len(true_labels))\n",
" for label_index, label in enumerate(cluster_names):\n",
" flipped_labels[pred_labels == label] = perm[label_index]\n",
"\n",
" testAcc = np.sum(flipped_labels == true_labels) / len(true_labels)\n",
" if testAcc > accuracy:\n",
" accuracy = testAcc\n",
" remapped_labels = flipped_labels\n",
"\n",
" return accuracy, remapped_labels"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get initial clustering using K-means"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8641791044776119\n"
]
}
],
"source": [
"kmeans = KMeans(n_clusters=3, random_state=9).fit(X_seeds)\n",
"initial_result = kmeans.labels_\n",
"\n",
"intial_accuracy, remapped_initial_result = remap_labels(initial_result, y_seeds)\n",
"print(intial_accuracy)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get convex hulls of the initial clustering"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_keys([2, 1, 0])\n"
]
}
],
"source": [
"# Get the idices of the data points belonging to each cluster\n",
"indices = {}\n",
"\n",
"for i in range(len(id_seeds)):\n",
" if int(remapped_initial_result[i]) not in indices:\n",
" indices[int(remapped_initial_result[i])] = [i]\n",
" else:\n",
" indices[int(remapped_initial_result[i])].append(i)\n",
"\n",
"# Get convex hulls for each cluster\n",
"hulls = {}\n",
"\n",
"for i in indices:\n",
" hull = ConvexHull(X_seeds[indices[i]])\n",
" hulls[i] = hull\n",
"\n",
"print(hulls.keys())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Assign remaining points to the cluster of the closest convex hull"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def point_in_hull(point, hull, tolerance=1e-12):\n",
" return all(\n",
" (np.dot(eq[:-1], point) + eq[-1] <= tolerance) for eq in hull.equations)\n",
"\n",
"def proj2hull(z, equations):\n",
" \"\"\"\n",
" Project `z` to the convex hull defined by the\n",
" hyperplane equations of the facets\n",
"\n",
" Arguments\n",
" z: array, shape (ndim,)\n",
" equations: array shape (nfacets, ndim + 1)\n",
"\n",
" Returns\n",
" x: array, shape (ndim,)\n",
" \"\"\"\n",
" G = np.eye(len(z), dtype=float)\n",
" a = np.array(z, dtype=float)\n",
" C = np.array(-equations[:, :-1], dtype=float)\n",
" b = np.array(equations[:, -1], dtype=float)\n",
" x, f, xu, itr, lag, act = solve_qp(G, a, C.T, b, meq=0, factorized=True)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"prediction = []\n",
"\n",
"for z1 in X_rest:\n",
" \n",
" min_cluster_distance = 100000\n",
" min_distance_point = \"\"\n",
" min_cluster_distance_hull = \"\"\n",
" \n",
" for i in indices:\n",
"\n",
" p = proj2hull(z1, hulls[i].equations)\n",
"\n",
" dist = np.linalg.norm(z1-p)\n",
"\n",
" if dist < min_cluster_distance:\n",
" min_cluster_distance = dist\n",
" min_distance_point = p\n",
" min_cluster_distance_hull = i\n",
" \n",
" prediction.append(min_cluster_distance_hull)\n",
"\n",
"prediction = np.array(prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate final result"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy of Convex Hull method: 0.867\n"
]
}
],
"source": [
"Y_pred = np.concatenate((remapped_initial_result, prediction))\n",
"Y_real = np.concatenate((y_seeds, y_rest))\n",
"\n",
"final_accuracy, remapped_final_result = remap_labels(Y_pred, Y_real)\n",
"\n",
"print(\"Accuracy of Convex Hull method:\", final_accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.867"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_score(Y_real, Y_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run K-means on the entire dataset"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy of K-means method: 0.866\n"
]
}
],
"source": [
"kmeans = KMeans(n_clusters=3, random_state=9).fit(X)\n",
"result = kmeans.labels_\n",
"\n",
"accuracy, remapped_result = remap_labels(result, labels_true)\n",
"\n",
"print(\"Accuracy of K-means method:\", accuracy)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment