Skip to content

Instantly share code, notes, and snippets.

@OnlyBelter
Created July 24, 2020 13:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save OnlyBelter/71d399309ebe6ca974c0bc0c9ad95611 to your computer and use it in GitHub Desktop.
Save OnlyBelter/71d399309ebe6ca974c0bc0c9ad95611 to your computer and use it in GitHub Desktop.
Do hierarchy clustering by scipy and split clusters by distance threshold
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"np.set_printoptions(suppress=True)\n",
"import scipy\n",
"from scipy.spatial import distance\n",
"from scipy.cluster import hierarchy\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"sns.set()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sepal_length</th>\n",
" <th>sepal_width</th>\n",
" <th>petal_length</th>\n",
" <th>petal_width</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.1</td>\n",
" <td>3.5</td>\n",
" <td>1.4</td>\n",
" <td>0.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.9</td>\n",
" <td>3.0</td>\n",
" <td>1.4</td>\n",
" <td>0.2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width\n",
"0 5.1 3.5 1.4 0.2\n",
"1 4.9 3.0 1.4 0.2"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris = sns.load_dataset('iris').iloc[range(10), :]\n",
"species = iris.pop('species')\n",
"iris.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(10, 10)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.000000</td>\n",
" <td>0.995999</td>\n",
" <td>0.999974</td>\n",
" <td>0.998168</td>\n",
" <td>0.999347</td>\n",
" <td>0.999586</td>\n",
" <td>0.998811</td>\n",
" <td>0.999538</td>\n",
" <td>0.998077</td>\n",
" <td>0.996552</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.995999</td>\n",
" <td>1.000000</td>\n",
" <td>0.996607</td>\n",
" <td>0.997397</td>\n",
" <td>0.992233</td>\n",
" <td>0.993592</td>\n",
" <td>0.990721</td>\n",
" <td>0.997118</td>\n",
" <td>0.998546</td>\n",
" <td>0.999033</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 1.000000 0.995999 0.999974 0.998168 0.999347 0.999586 0.998811 \n",
"1 0.995999 1.000000 0.996607 0.997397 0.992233 0.993592 0.990721 \n",
"\n",
" 7 8 9 \n",
"0 0.999538 0.998077 0.996552 \n",
"1 0.997118 0.998546 0.999033 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris_corr = iris.T.corr()\n",
"print(iris_corr.shape)\n",
"iris_corr.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### distance.pdist\n",
"- Pairwise distances between observations in n-dimensional space."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.01442885, 0.0009666 , 0.00625717, 0.00568494, 0.00320313,\n",
" 0.00792722, 0.00322728, 0.00768217, 0.01158714, 0.01377558,\n",
" 0.0098153 , 0.01727702, 0.01620735, 0.01825397, 0.01291293,\n",
" 0.00842762, 0.00425646, 0.00549239, 0.00658408, 0.0041295 ,\n",
" 0.00879105, 0.00241845, 0.00684741, 0.01085314, 0.01071777,\n",
" 0.00855472, 0.0124801 , 0.00364936, 0.00181753, 0.00603786,\n",
" 0.00272345, 0.00231928, 0.00858389, 0.01220441, 0.01515585,\n",
" 0.00496781, 0.00602494, 0.01013213, 0.0136038 , 0.01070284,\n",
" 0.0139248 , 0.01644851, 0.00508454, 0.00948891, 0.00459087])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
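"dis = distance.pdist(iris_corr, metric='euclidean') # condensed pairwise Euclidean distances between the rows of iris_corr (the correlation matrix is symmetric, so rows and columns are interchangeable)\n",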
"dis"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0. , 2. , 0.0009666 , 2. ],\n",
" [ 3. , 8. , 0.00181753, 2. ],\n",
" [ 4. , 6. , 0.00231928, 2. ],\n",
" [ 7. , 10. , 0.00281043, 3. ],\n",
" [ 5. , 12. , 0.0038345 , 3. ],\n",
" [ 1. , 9. , 0.00425646, 2. ],\n",
" [11. , 13. , 0.00574414, 5. ],\n",
" [14. , 16. , 0.00838076, 8. ],\n",
" [15. , 17. , 0.01187589, 10. ]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"row_linkage = hierarchy.linkage(dis, method='centroid') # each row: ids of the two clusters merged, the distance between them, and the number of samples in the newly formed cluster\n",
"row_linkage"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 9, 5, 4, 6, 3, 8, 7, 0, 2], dtype=int32)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hierarchy.leaves_list(row_linkage)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 设置p的值和truncate_mode来确定展示的树形结构的深度"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAD7CAYAAABnoJM0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAcx0lEQVR4nO3df3BU1f3/8efmp2KigcxuQoOO6LfIZwiYaj5+aJRgZ5gsIlEmJaVDSlprQ0sVBG2+MCYNxUoRmyFULXxbrHagSScZqkkz4iaIw9exRIek/qAQnQlTBRPdbBKqSQiyyd7vHw777UrjJiHchZzX46+ce87dc97eNa+99+4NDsuyLERExFhRkV6AiIhEloJARMRwCgIREcMpCEREDKcgEBExnIJARMRwCgIREcPFRHoBY3HqVD+BgD2PPyQnJ9Dd3WfLXJGg+i5vE7m+iVwb2FtfVJSDyZOvGrb/sgyCQMCyLQjOzTeRqb7L20SubyLXBpdOfbo0JCJiuBEFQX19PYsWLSInJ4fKysrz+ltbW8nLy8PtdlNSUsLg4GBI//bt23n66aeD7ePHj1NQUMC9997LsmXLaG1tvcAyRERkrMIGgdfrpaKigqqqKmpra6murqatrS1kTHFxMWVlZTQ0NGBZFjU1NQD09vby6KOP8vzzz4eMLy0tpaioiLq6OtauXcv69evHsSQRERmNsEFw6NAh5s6dS1JSEpMmTcLtduPxeIL97e3tnDlzhoyMDADy8vKC/QcOHOD666/nvvvuC3nN/Px85s2bB8BNN93Exx9/PG4FiYjI6IQNgs7OTpxOZ7Dtcrnwer3D9judzmD/kiVLWLlyJdHR0SGvmZeXF9z21FNPsWDBggurQkRExizst4YCgQAOhyPYtiwrpB2ufziWZfHkk0/yzjvvsHv37lEtOjk5YVTjL5TTmWjrfHZTfZe3iVzfRK4NLp36wgZBamoqzc3NwbbP58PlcoX0+3y+YLurqyuk/z8ZHBxk/fr1eL1edu/eTWLi6P5jdHf32fa1K6czEZ+v15a5Dr7dzptHveEHjqPYuGj8Z4dsndNOqm98/c+sFO7MSLNlLjv/34sEO+uLinJ85QfosJeGsrKyaGpqoqenh4GBARobG8nOzg72p6WlER8fT0tLCwB1dXUh/f/J1q1b6evr47nnnht1CExkbx71cqJz4j5AI5e3E519tn9QEXuEPSNISUlh3bp1FBYW4vf7Wbp0KXPmzKGoqIg1a9Ywe/ZsysvLKS0tpa+vj1mzZlFYWDjs6/X09FBZWcm0adPIz88Pbq+rqxufii5z17kSWF9wi23z6VPX5c3O+rZW/t2WecR+I3qyODc3l9zc3JBtu3btCv48c+ZM9u7dO+z+q1evDv48ZcoUjh07Ntp1iojIRaIni0VEDKcgEBExnIJARMRwCgIREcMpCEREDKcgEBExnIJARMRwCgIREcMpCEREDKcgEBExnIJARMRwCgIREcMpCEREDKcgEBExnIJARMRwCgIREcMpCEREDKcgEBExnIJARMRwCgIREcMpCEREDKcgEBExnIJARMRwCgIREcMpCEREDKcgEBEx3IiCoL6+nkWLFpGTk0NlZeV5/a2treTl5eF2uykpKWFwcDCkf/v27Tz99NPB9meffcbKlSu56667KCgowOfzXWAZIiIyVmGDwOv1UlFRQVVVFbW1tVRXV9PW1hYypri4mLKyMhoaGrAsi5qaGgB6e3t59NFHef7550PGb9++nczMTF5++WXy8/PZvHnzOJYkIiKjETYIDh06xNy5c0lKSmLSpEm43W48Hk+wv729nTNnzpCRkQFAXl5esP/AgQNcf/313HfffSGvefDgQXJzcwFYvHgxr732Gn6/f9yKEhGRkQsbBJ2dnTidzmDb5XLh9XqH7Xc6ncH+JUuWsHLlSqKjo4d9zZiYGBISEujp6bmwSkREZExiwg0IBA
I4HI5g27KskHa4/pGwLIuoqJHft05OThjV618opzPRlnli46Jtne8cu+ezm+obH5F4f+rY2SNsEKSmptLc3Bxs+3w+XC5XSP+/3+zt6uoK6f9PXC4XXV1dpKamMjg4SH9/P0lJSSNedHd3H4GANeLxF8LpTMTn67VlLv/ZIQDb5gN764sE1Td+7H5/6tiNn6gox1d+gA77MTwrK4umpiZ6enoYGBigsbGR7OzsYH9aWhrx8fG0tLQAUFdXF9L/n8yfP5/a2loA9u3bR2ZmJrGxsSMqSERExlfYIEhJSWHdunUUFhayZMkSFi9ezJw5cygqKuLIkSMAlJeXs2XLFhYuXMjp06cpLCz8ytd86KGHePvtt7n77rupqqqirKxsfKoREZFRc1iWZc81lnE0US8Nba38OwDrC26xZT7Q6fflbiK/P3Xsxs8FXxoSEZGJTUEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImK4mEgvQGSiOPh2O28e9do6Z2xcNP6zQ7bMdaKzF4CtlX+3ZT47awP4n1kp3JmRZtt8lxKdEYiMkzePejnR2RfpZVw017kSuc6VGOllXBQnOvtsD/FLic4IRMbRda4E1hfcYtt8TmciPl+vbfPZyc7a7DrLuVTpjEBExHAjCoL6+noWLVpETk4OlZWV5/W3traSl5eH2+2mpKSEwcFBADo6OigoKGDhwoWsWrWK/v5+AD799FOKioq45557WLp0Ka2treNYkoiIjEbYIPB6vVRUVFBVVUVtbS3V1dW0tbWFjCkuLqasrIyGhgYsy6KmpgaATZs2sXz5cjweD+np6ezYsQOA559/nhkzZvDXv/6Vn/70pzz22GMXoTQRERmJsEFw6NAh5s6dS1JSEpMmTcLtduPxeIL97e3tnDlzhoyMDADy8vLweDz4/X4OHz6M2+0O2Q4QCASCZwcDAwNcccUV416YiIiMTNibxZ2dnTidzmDb5XLx7rvvDtvvdDrxer2cOnWKhIQEYmJiQrYD/PCHP2TZsmXccccd9Pf389xzz41bQSIiMjphgyAQCOBwOIJty7JC2sP1f3kcEGz/8pe/pKCggMLCQt566y3WrVvHSy+9xFVXXTWiRScnJ4xo3HhxOu35ylxsXLSt851j93x20/G7fOnY2SNsEKSmptLc3Bxs+3w+XC5XSL/P5wu2u7q6cLlcTJkyhd7eXoaGhoiOjg7Z78CBA8H7At/4xjdITk7m+PHjzJkzZ0SL7u7uIxCwRlbhBbLzK2znHp6x8+uAE/nrh6DjdznTsRs/UVGOr/wAHfYeQVZWFk1NTfT09DAwMEBjYyPZ2dnB/rS0NOLj42lpaQGgrq6O7OxsYmNjyczMZN++fQDU1tYG95s5cyavvPIKAB988AGdnZ1Mnz597FWKiMiYhQ2ClJQU1q1bR2FhIUuWLGHx4sXMmTOHoqIijhw5AkB5eTlbtmxh4cKFnD59msLCQgA2btxITU0NixYtorm5mbVr1wLwxBNP8Je//IXFixfz8MMPs3XrVhITL41TJBER04zoyeLc3Fxyc3NDtu3atSv488yZM9m7d+95+6WlpbFnz57ztl9//fXs3r17tGsVEZGLQE8Wi4gYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImK4EQVBfX09ixYtIicnh8rKyvP6W1tbycvLw+12U1JSwuDgIAAdHR0UFBSwcOFCVq1aRX9/PwB9fX088sgjLFmyhCVLlnD06NFxLElERE
YjbBB4vV4qKiqoqqqitraW6upq2traQsYUFxdTVlZGQ0MDlmVRU1MDwKZNm1i+fDkej4f09HR27NgBwJYtW5g6dSq1tbU8/PDD/OIXvxj/ykREZETCBsGhQ4eYO3cuSUlJTJo0CbfbjcfjCfa3t7dz5swZMjIyAMjLy8Pj8eD3+zl8+DButztku2VZNDY2snLlSgCys7P51a9+dTFqExGREQgbBJ2dnTidzmDb5XLh9XqH7Xc6nXi9Xk6dOkVCQgIxMTEh27u7u4mLi6Oqqoply5ZRWFjI0NDQeNYkIiKjEBNuQCAQwOFwBNuWZYW0h+v/8jgAh8PB0NAQXV1dJCYmUl1dzd/+9jceeOABDhw4MOJFJycnjHjseHA6E22ZJzYu2tb5zrF7Prvp+F2+dOzsETYIUlNTaW5uDrZ9Ph8ulyuk3+fzBdtdXV24XC6mTJlCb28vQ0NDREdHB/ebPHkyMTExLF68GIDbb7+d06dP093dTXJy8ogW3d3dRyBgjbjIC+F0JuLz9doyl//sF2dGds0H9tYXCTp+ly8du/ETFeX4yg/QYS8NZWVl0dTURE9PDwMDAzQ2NpKdnR3sT0tLIz4+npaWFgDq6urIzs4mNjaWzMxM9u3bB0BtbS3Z2dnExcWRlZXFSy+9BMDbb7/NlVdeyeTJky+oUBERGZuwZwQpKSmsW7eOwsJC/H4/S5cuZc6cORQVFbFmzRpmz55NeXk5paWl9PX1MWvWLAoLCwHYuHEjGzZsYOfOnUydOpVt27YBsHnzZsrKyqiqqiImJoaKigqiovRIg4h84V//9yCfvHUY/9lBW+b7PPBfAJx8ssGW+QCGFtxJ9C3ftG2+r+KwLMueayzjaKJeGtpa+XcA1hfcYst8MLEvLYCO3+Xq5JNbONt+kri0ayO9lIvi85MnSLjxBlLXFtsyX7hLQ2HPCEREIuGq6dNt+0Vpt5NPbon0EkLoeoyIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYLibSC7iUnW09SIfnMH7/oC3zDXX/FwCn6xtsmQ/gs4w74dpv2jafiFx6FARfYbCtCavnJI4p19oy39rprbbMc85Q9wn6jr5OrIJAxGgjCoL6+np27tzJ4OAg3//+9ykoKAjpb21tpaSkhP7+fjIzM9m0aRMxMTF0dHRQXFxMd3c306dPp7y8nKuuuiq43yeffMI999zDCy+8wLRp08a3snESlzKd2IXFkV7GRXG6fkuklyAil4Cw9wi8Xi8VFRVUVVVRW1tLdXU1bW1tIWOKi4spKyujoaEBy7KoqakBYNOmTSxfvhyPx0N6ejo7duwI7hMIBCgpKcHv949zSSIiMhphg+DQoUPMnTuXpKQkJk2ahNvtxuPxBPvb29s5c+YMGRkZAOTl5eHxePD7/Rw+fBi32x2y/Zxnn32WrKwsJk+ePN41iYjIKIQNgs7OTpxOZ7Dtcrnwer3D9judTrxeL6dOnSIhIYGYmJiQ7QD/+Mc/eOONN7jvvvvGrRARERmbsPcIAoEADocj2LYsK6Q9XP+XxwE4HA4GBgbYtGkTv/nNb4iKGtu3V5OTE8a032h1xJ4LsURb5rPbRK/vHLvqi42LtnW+cybi8fskbmK/Ny+1+sIGQWpqKs3NzcG2z+fD5XKF9Pt8vmC7q6sLl8vFlClT6O3tZWhoiOjo6OB+zc3NdHd3s2rVKuCLM4qVK1fyzDPPcMMNN4xo0d3dfQQC1oiLHCu/f5DY2Bh8vt6LPlckTPT6Xm9/g3d6juD3D9ky3z97vvjCQ0nDr22ZD+Bb/2suN1+dYdt8dvGfHSQ2buK+N+2uLyrK8ZUfoMN+JM/KyqKpqYmenh4GBgZobGwkOzs72J+WlkZ8fDwtLS
0A1NXVkZ2dTWxsLJmZmezbtw+A2tpasrOzmTdvHq+++ip1dXXU1dXhcrn4/e9/P+IQEBmpZu/bfPCvj2yb78b//ogb/9u++T7q6+D1Dw/bNp9MXGHPCFJSUli3bh2FhYX4/X6WLl3KnDlzKCoqYs2aNcyePZvy8nJKS0vp6+tj1qxZFBYWArBx40Y2bNjAzp07mTp1Ktu2bbvoBYn8u+uTpvHA7KJIL+Oi2P73/xPpJcgEMaLnCHJzc8nNzQ3ZtmvXruDPM2fOZO/eveftl5aWxp49e77ytV999dWRLEFERC4S/a0hERHDKQhERAynIBARMZyCQETEcAoCERHDKQhERAynIBARMZyCQETEcAoCERHDKQhERAynIBARMZyCQETEcAoCERHDKQhERAynIBARMZyCQETEcAoCERHDKQhERAynIBARMZyCQETEcAoCERHDKQhERAynIBARMZyCQETEcAoCERHDKQhERAw3oiCor69n0aJF5OTkUFlZeV5/a2sreXl5uN1uSkpKGBwcBKCjo4OCggIWLlzIqlWr6O/vB+D48eMUFBRw7733smzZMlpbW8exJBERGY2wQeD1eqmoqKCqqora2lqqq6tpa2sLGVNcXExZWRkNDQ1YlkVNTQ0AmzZtYvny5Xg8HtLT09mxYwcApaWlFBUVUVdXx9q1a1m/fv1FKE1EREYibBAcOnSIuXPnkpSUxKRJk3C73Xg8nmB/e3s7Z86cISMjA4C8vDw8Hg9+v5/Dhw/jdrtDtgPk5+czb948AG666SY+/vjjcS9MRERGJibcgM7OTpxOZ7Dtcrl49913h+13Op14vV5OnTpFQkICMTExIdvhi1A456mnnmLBggWjWnRycsKoxo9VR+y5tSfaMp/dJnp9sbHRgOq7HH0SN7Hfm5dafWGDIBAI4HA4gm3LskLaw/V/eRxw3rgnn3ySd955h927d49q0d3dfQQC1qj2GQu/f5DY2Bh8vt6LPlckTPz6hoiNjVZ9lyH/2UFi4ybwe9Pm+qKiHF/5ATrspaHU1FR8Pl+w7fP5cLlcw/Z3dXXhcrmYMmUKvb29DA0Nnbff4OAgP/vZzzhy5Ai7d+8mMfHSSEUREROFDYKsrCyampro6elhYGCAxsZGsrOzg/1paWnEx8fT0tICQF1dHdnZ2cTGxpKZmcm+ffsAqK2tDe63detW+vr6eO655xQCIiIRFvbSUEpKCuvWraOwsBC/38/SpUuZM2cORUVFrFmzhtmzZ1NeXk5paSl9fX3MmjWLwsJCADZu3MiGDRvYuXMnU6dOZdu2bfT09FBZWcm0adPIz88PzlNXV3fxqhQRkWGFDQKA3NxccnNzQ7bt2rUr+PPMmTPZu3fvefulpaWxZ8+e87YfO3ZstOsUEZGLRE8Wi4gYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImI4BYGIiOEUBCIihlMQiIgYTkEgImK4EQVBfX09ixYtIicnh8rKyvP6W1tbycvLw+12U1JSwuDgIAAdHR0UFBSwcOFCVq1aRX9/PwCfffYZK1eu5K677qKgoACfzzeOJYmIyGiEDQKv10tFRQVVVVXU1tZSXV1NW1tbyJji4mLKyspoaGjAsixqamoA2LRpE8uXL8fj8ZCens6OHTsA2L59O5mZmbz88svk5+ezefPmi1CaiIiMREy4AYcOHWLu3LkkJSUB4Ha78Xg8PPjggwC0t7dz5swZMjIyAMjLy+Opp54iPz+fw4cP89vf/ja4/Xvf+x7FxcUcPHgweGaxePFiHnvsMfx+P7GxsSNadFSUY/SVjkF0QhLRMTG2zWe3iV5f0hVXExsbrfouQ3GTk4iJnbjvTbvrCzdP2CDo7OzE6XQG2y6Xi3fffXfYfqfTid
fr5dSpUyQkJBATExOy/cv7xMTEkJCQQE9PDykpKSMqavLkq0Y07oJ953/bM0+kTPD6iuf/ONJLuKgmcn3Jj07s9+alVl/YS0OBQACH4/+niWVZIe3h+r88Djiv/e/7REXpvrWISCSE/e2bmpoacjPX5/PhcrmG7e/q6sLlcjFlyhR6e3sZGho6bz+Xy0VXVxcAg4OD9Pf3By89iYiIvcIGQVZWFk1NTfT09DAwMEBjYyPZ2dnB/rS0NOLj42lpaQGgrq6O7OxsYmNjyczMZN++fQDU1tYG95s/fz61tbUA7Nu3j8zMzBHfHxARkfHlsCzLCjeovr6e3/3ud/j9fpYuXUpRURFFRUWsWbOG2bNn895771FaWkpfXx+zZs1iy5YtxMXF0d7ezoYNG+ju7mbq1Kls27aNa665hn/9619s2LCBkydPkpiYSHl5OdOmTbOjXhER+ZIRBYGIiExcukMrImI4BYGIiOEUBCIihlMQiIgYLuyTxSazLIsNGzYwY8YM7r///kgvZ1zt2bOHP/3pT1xxxRXceOONlJWVTahnOZ544gk8Hg/XXHMNANOnT2f79u0RXtWF+fWvf803v/lNuru7+cMf/oDD4eDKK6+kpKSE2bNn88orr/D+++/zwAMPRHqpY2JKfR988AF//vOfcTgcXHvttTz++OMkJydHtj5L/qO2tjZrxYoV1s0332w9++yzkV7OuGpqarLmzZtnffzxx5ZlWdaLL75orV69OsKrGl/f+c53rJaWlkgvY9y89dZb1o9//GPr+PHj1u233255vV7Lsizr4MGD1vz584PjfvCDH1jHjh2L0CrHzpT6jhw5Yn3rW9+yPvvsM8uyLOuJJ56wfv7znwfHRao+XRoaRmVlJfn5+SxcuDDSSxl3R48eJSsri9TUVABycnJ49dVXOXv2bIRXNj7Onj3LsWPHePbZZ8nNzWX16tV0dHREelkX5Omnn2bZsmXExcXx+OOPB5/ST09Pp6urK3jsli5dyjPPPBPJpY6JKfWlp6fT0NBAYmIin3/+OV6vN+RMPGL12R49l5n169dPuDOCw4cPW/Pnz7c++ugjy7Isa8+ePdaMGTOCn8IudydOnLB+9KMfWe+//74VCASsXbt2Wffee68VCAQivbQx+fTTT62bb77Z+vzzz0O2BwIB65FHHgk5mzt16pSVnp5uDQwM2L3MMTOxvv3791u33Xabdccdd1j//Oc/g9sjVZ/OCAyUmZnJAw88wIMPPkheXh4Oh4OkpKQJ82c+rr32Wnbt2sWMGTNwOBzcf//9nDhxgo8++ijSSxuTDz/8EKfTSVxcXHDb6dOneeihhzhx4gSPP/54cHtSUhLx8fG0t7dHYqljYmJ9CxYs4M0332T16tXcf//9BAIBIHL1KQgM1NfXx2233caLL77ICy+8wIIFCwAmzM3i9957L/i3rM6xLOuyDTqHwxH8RQFf/Mt/3/3ud4mOjmb37t1cffXVIeOjo6OJjo62e5ljZlJ9H374Ic3NzcG+b3/723R0dPDpp58Gt0WiPgWBgTo7O1mxYgV9fX0A7Ny5k7vvvnvYPxN+uYmKimLz5s2cPHkSgKqqKm666abgPZHLzXXXXUd3dzeff/45fX19rFixgpycHCoqKrjiiitCxvb29nL27Fm+9rWvRWi1o2dSfT6fj4cffpienh7gi7/j9vWvf53JkycDkatPXx810A033MDKlSvJz88nEAhw6623UlZWFulljZsZM2ZQWlrKqlWrGBoaIjU1lW3btkV6WWN29dVXc+utt/LGG2/w3nvv0dHRwf79+9m/f39wzB//+EcmT57M66+/zp133hlyGeJSZ1J98+fP5yc/+QmFhYVER0fjcrmC/4ojELn6bL0jISJj0tLSYhUVFYUdt2LFCqu1tdWGFY0v1feFSNWnS0Mil4FbbrmF6dOn89prrw07Zv/+/WRmZjJz5kwbVzY+VF9k69OfoRYRMZzOCEREDKcgEBExnIJARM
RwCgIREcMpCEREDKcgEBEx3P8Du1yNicTj3dwAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dgram = hierarchy.dendrogram(row_linkage, p=2, truncate_mode='level')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 设置color threshold,确定用颜色区分类的个数"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dgram = hierarchy.dendrogram(row_linkage, color_threshold=0.005)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'icoord': [[5.0, 5.0, 15.0, 15.0],\n",
" [35.0, 35.0, 45.0, 45.0],\n",
" [25.0, 25.0, 40.0, 40.0],\n",
" [55.0, 55.0, 65.0, 65.0],\n",
" [85.0, 85.0, 95.0, 95.0],\n",
" [75.0, 75.0, 90.0, 90.0],\n",
" [60.0, 60.0, 82.5, 82.5],\n",
" [32.5, 32.5, 71.25, 71.25],\n",
" [10.0, 10.0, 51.875, 51.875]],\n",
" 'dcoord': [[0.0, 0.004256456009326175, 0.004256456009326175, 0.0],\n",
" [0.0, 0.0023192825225930593, 0.0023192825225930593, 0.0],\n",
" [0.0, 0.003834496320219779, 0.003834496320219779, 0.0023192825225930593],\n",
" [0.0, 0.001817534593237589, 0.001817534593237589, 0.0],\n",
" [0.0, 0.0009665986618332519, 0.0009665986618332519, 0.0],\n",
" [0.0, 0.0028104314844498227, 0.0028104314844498227, 0.0009665986618332519],\n",
" [0.001817534593237589,\n",
" 0.005744138760901555,\n",
" 0.005744138760901555,\n",
" 0.0028104314844498227],\n",
" [0.003834496320219779,\n",
" 0.008380764348347213,\n",
" 0.008380764348347213,\n",
" 0.005744138760901555],\n",
" [0.004256456009326175,\n",
" 0.011875889940371007,\n",
" 0.011875889940371007,\n",
" 0.008380764348347213]],\n",
" 'ivl': ['1', '9', '5', '4', '6', '3', '8', '7', '0', '2'],\n",
" 'leaves': [1, 9, 5, 4, 6, 3, 8, 7, 0, 2],\n",
" 'color_list': ['C1', 'C2', 'C2', 'C3', 'C4', 'C4', 'C0', 'C0', 'C0']}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dgram"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def get_node_id(node):\n",
"    \"\"\"Collect the ids of all leaf nodes under ``node``, ordered left to right.\n",
"\n",
"    node: a scipy.cluster.hierarchy.ClusterNode (a leaf or an internal node)\n",
"    return: list of leaf ids; a leaf yields a one-element list holding its own id\n",
"    \"\"\"\n",
"    assert isinstance(node, scipy.cluster.hierarchy.ClusterNode)\n",
"    leaf_ids = []\n",
"    # An explicit stack replaces recursion so that deep, chain-like trees\n",
"    # cannot exceed Python's recursion limit.\n",
"    pending = [node]\n",
"    while pending:\n",
"        current = pending.pop()\n",
"        if current.count == 1:\n",
"            leaf_ids.append(current.id)\n",
"        else:\n",
"            # Push right first so the left subtree is popped (visited) first.\n",
"            pending.append(current.right)\n",
"            pending.append(current.left)\n",
"    return leaf_ids\n",
"\n",
"def split_tree_by_distance(node, threshold):\n",
"    \"\"\"Cut the subtree rooted at ``node`` into sub-clusters by merge distance.\n",
"\n",
"    Walking down from ``node``, descent stops at the first node that is a leaf\n",
"    or whose ``dist`` is <= ``threshold``; that node becomes a sub-cluster root.\n",
"\n",
"    node: a scipy.cluster.hierarchy.ClusterNode to split\n",
"    threshold: a float, largest ``dist`` a non-leaf sub-cluster root may have\n",
"    return: list of ClusterNode, the sub-cluster roots ordered left to right;\n",
"        a leaf is returned as a one-element list\n",
"    \"\"\"\n",
"    assert isinstance(node, scipy.cluster.hierarchy.ClusterNode)\n",
"    subcluster_roots = []\n",
"    pending = [node]\n",
"    while pending:\n",
"        current = pending.pop()\n",
"        # A leaf is its own singleton cluster and must be collected in the list\n",
"        # like any other root; returning it bare made list concatenation fail.\n",
"        if current.count < 2 or current.dist <= threshold:\n",
"            subcluster_roots.append(current)\n",
"        else:\n",
"            # Push right first so the left subtree is processed first.\n",
"            pending.append(current.right)\n",
"            pending.append(current.left)\n",
"    return subcluster_roots\n",
"\n",
"def cut_cluster_by_distance(linkage_matrix, threshold):\n",
"    \"\"\"\n",
"    Map every leaf id of the clustering tree to the index of its sub-cluster.\n",
"\n",
"    linkage_matrix: the matrix comes from hierarchy.linkage, \n",
"        https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage\n",
"    threshold: a float, max distance of clusters in the tree at the cut point\n",
"    return: dict of leaf id -> sub-cluster index, where the index is the position\n",
"        of the sub-cluster in the list returned by split_tree_by_distance\n",
"    \"\"\"\n",
"    tree_root = hierarchy.to_tree(linkage_matrix)\n",
"    return {\n",
"        leaf_id: cluster_index\n",
"        for cluster_index, cluster_root in enumerate(\n",
"            split_tree_by_distance(tree_root, threshold=threshold)\n",
"        )\n",
"        for leaf_id in get_node_id(cluster_root)\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{1: 0, 9: 0, 5: 1, 4: 1, 6: 1, 3: 2, 8: 2, 7: 3, 0: 3, 2: 3}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cut_cluster_by_distance(row_linkage, 0.005)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment