TomAnthony/even_clusters.py

## even_clusters.py
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from fastcluster import linkage\n",
    "from scipy.spatial.distance import squareform\n",
    "\n",
    "distance_matrix = np.array([[0.0, .25, .25, .25],\n",
    "                            [.25, 0.0, .25, .25],\n",
    "                            [.25, .25, 0.0, .25],\n",
    "                            [.25, .25, .25, 0.0]])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.  ,  0.25,  0.25,  0.25],\n",
       "       [ 0.25,  0.  ,  0.25,  0.25],\n",
       "       [ 0.25,  0.25,  0.  ,  0.25],\n",
       "       [ 0.25,  0.25,  0.25,  0.  ]])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "distance_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "compressed_distance_matrix = squareform(distance_matrix)\n",
    "Z = linkage(compressed_distance_matrix, method=\"complete\", metric=\"cosine\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.  ,  1.  ,  0.25,  2.  ],\n",
       "       [ 2.  ,  4.  ,  0.25,  3.  ],\n",
       "       [ 3.  ,  5.  ,  0.25,  4.  ]])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Z"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Num desired clusters\n",
    "k = 2\n",
    "\n",
    "# Count entries in the matrix\n",
    "initial_count = len(distance_matrix[0])\n",
    "\n",
    "# Fill an array with indices from the matrix\n",
    "c = [[i] for i in range(initial_count)]\n",
    "# Now extend with empty entries for merging into\n",
    "c.extend([[]]*(initial_count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Loop the linkage matrix, making merges\n",
    "# into new clusters until we have k clusters.\n",
    "for i, block in enumerate(Z):\n",
    "    j = i + initial_count\n",
    "    a = int(block[0])\n",
    "    b = int(block[1])\n",
    "\n",
    "    # don't cluster randomly.\n",
    "    if block[2] >= 1:\n",
    "        break\n",
    "\n",
    "    # merge a and b into j\n",
    "    c[j] = c[a]\n",
    "    c[j].extend(c[b])\n",
    "    # empty a and b\n",
    "    c[a] = c[b] = []\n",
    "    \n",
    "    # count clusters and see if we have\n",
    "    # reached our target number\n",
    "    cluster_count = sum([min(len(x),1) for x in c])\n",
    "    if cluster_count <= k:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[3], [2, 0, 1]]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clusters = [x for x in c if x]\n",
    "clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# I have 2 clusters of size 1 and 3, but I'd like 2 and 2\n",
    "# More generally I want to cluster evenly where possible for larger arrays.\n",
    "# If  I have a small k it will then start combining those clusters (also evenly)."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"from fastcluster import linkage\n",
	"from scipy.spatial.distance import squareform\n",
	"\n",
	"distance_matrix = np.array([[0.0, .25, .25, .25],\n",
	" [.25, 0.0, .25, .25],\n",
	" [.25, .25, 0.0, .25],\n",
	" [.25, .25, .25, 0.0]])\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[ 0. , 0.25, 0.25, 0.25],\n",
	" [ 0.25, 0. , 0.25, 0.25],\n",
	" [ 0.25, 0.25, 0. , 0.25],\n",
	" [ 0.25, 0.25, 0.25, 0. ]])"
	]
	},
	"execution_count": 19,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"distance_matrix"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [],
	"source": [
	"compressed_distance_matrix = squareform(distance_matrix)\n",
	"Z = linkage(compressed_distance_matrix, method=\"complete\", metric=\"cosine\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[ 0. , 1. , 0.25, 2. ],\n",
	" [ 2. , 4. , 0.25, 3. ],\n",
	" [ 3. , 5. , 0.25, 4. ]])"
	]
	},
	"execution_count": 21,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"Z"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Num desired clusters\n",
	"k = 2\n",
	"\n",
	"# Count entries in the matrix\n",
	"initial_count = len(distance_matrix[0])\n",
	"\n",
	"# Fill an array with indices from the matrix\n",
	"c = [[i] for i in range(initial_count)]\n",
	"# Now extend with empty entries for merging into\n",
	"c.extend([[]]*(initial_count))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Loop the linkage matrix, making merges\n",
	"# into new clusters until we have k clusters.\n",
	"for i, block in enumerate(Z):\n",
	" j = i + initial_count\n",
	" a = int(block[0])\n",
	" b = int(block[1])\n",
	"\n",
	" # don't cluster randomly.\n",
	" if block[2] >= 1:\n",
	" break\n",
	"\n",
	" # merge a and b into j\n",
	" c[j] = c[a]\n",
	" c[j].extend(c[b])\n",
	" # empty a and b\n",
	" c[a] = c[b] = []\n",
	" \n",
	" # count clusters and see if we have\n",
	" # reached our target number\n",
	" cluster_count = sum([min(len(x),1) for x in c])\n",
	" if cluster_count <= k:\n",
	" break"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[[3], [2, 0, 1]]"
	]
	},
	"execution_count": 37,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"clusters = [x for x in c if x]\n",
	"clusters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# I have 2 clusters of size 1 and 3, but I'd like 2 and 2\n",
	"# More generally I want to cluster evenly where possible for larger arrays.\n",
	"# If I have a small k it will then start combining those clusters (also evenly)."
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}