Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Linkage - Even Clustering?
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from fastcluster import linkage\n",
"from scipy.spatial.distance import squareform\n",
"\n",
"distance_matrix = np.array([[0.0, .25, .25, .25],\n",
" [.25, 0.0, .25, .25],\n",
" [.25, .25, 0.0, .25],\n",
" [.25, .25, .25, 0.0]])\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0. , 0.25, 0.25, 0.25],\n",
" [ 0.25, 0. , 0.25, 0.25],\n",
" [ 0.25, 0.25, 0. , 0.25],\n",
" [ 0.25, 0.25, 0.25, 0. ]])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distance_matrix"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"compressed_distance_matrix = squareform(distance_matrix)\n",
"Z = linkage(compressed_distance_matrix, method=\"complete\", metric=\"cosine\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0. , 1. , 0.25, 2. ],\n",
" [ 2. , 4. , 0.25, 3. ],\n",
" [ 3. , 5. , 0.25, 4. ]])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Z"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Num desired clusters\n",
"k = 2\n",
"\n",
"# Count entries in the matrix\n",
"initial_count = len(distance_matrix[0])\n",
"\n",
"# Fill an array with indices from the matrix\n",
"c = [[i] for i in range(initial_count)]\n",
"# Now extend with empty entries for merging into\n",
"c.extend([[]]*(initial_count))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Loop the linkage matrix, making merges\n",
"# into new clusters until we have k clusters.\n",
"for i, block in enumerate(Z):\n",
" j = i + initial_count\n",
" a = int(block[0])\n",
" b = int(block[1])\n",
"\n",
" # don't cluster randomly.\n",
" if block[2] >= 1:\n",
" break\n",
"\n",
" # merge a and b into j\n",
" c[j] = c[a]\n",
" c[j].extend(c[b])\n",
" # empty a and b\n",
" c[a] = c[b] = []\n",
" \n",
" # count clusters and see if we have\n",
" # reached our target number\n",
" cluster_count = sum([min(len(x),1) for x in c])\n",
" if cluster_count <= k:\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[3], [2, 0, 1]]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clusters = [x for x in c if x]\n",
"clusters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# I have 2 clusters of size 1 and 3, but I'd like 2 and 2\n",
"# More generally I want to cluster evenly where possible for larger arrays.\n",
"# If I have a small k it will then start combining those clusters (also evenly)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment