Skip to content

Instantly share code, notes, and snippets.

@zhuang-hao-ming
Last active February 13, 2022 04:23
Show Gist options
  • Save zhuang-hao-ming/ca8ac0c1d622e18897bbb4876948dd60 to your computer and use it in GitHub Desktop.
Save zhuang-hao-ming/ca8ac0c1d622e18897bbb4876948dd60 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import make_blobs\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"centers = [(0, 4), (5, 5), (8, 2)]\n",
"cluster_std = [1.2, 1, 1.1]\n",
"X, Y = make_blobs(n_samples=200, cluster_std=cluster_std, centers=centers, n_features=2, random_state=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, 'scatter data')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(X[Y==0, 0], X[Y==0, 1], s=10, label='cluster1')\n",
"plt.scatter(X[Y==1, 0], X[Y==1, 1], s=10, label='cluster2')\n",
"plt.scatter(X[Y==2, 0], X[Y==2, 1], s=10, label='cluster3')\n",
"plt.title('scatter data')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_neigh(eps, df, index):\n",
" \n",
" x, y = df.iloc[index]['x'], df.iloc[index]['y']\n",
" \n",
"# con1 = np.power(x - df['x'], 2) + np.power(y - df['y'], 2) <= eps ** 2\n",
"\n",
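" con1 = np.abs(x - df['x']) <= eps # per-axis test: together with con2 this selects a square window of half-width eps rather than a circle; the author found this better than the radius test (why?)\n",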
" con2 = np.abs(y - df['y']) <= eps\n",
" con3 = df.index != index\n",
" \n",
" temp = df[con1 & con2 & con3] # all points inside the eps window, excluding the point at index itself\n",
" \n",
" return temp.index"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def grow_cluster(df, visited, labels, idx, C, eps, min_pnts):\n",
" \n",
" '''\n",
" 从idx开始增长一个cluster\n",
" \n",
" \n",
" 1. 把核心节点标记为一个新的簇\n",
" 2. 让核心节点入队\n",
" \n",
" '''\n",
" \n",
" queue = []\n",
" \n",
" labels[idx] = C\n",
" visited[idx] = True\n",
" \n",
" queue.append(idx)\n",
" \n",
" while len(queue) != 0:\n",
" \n",
" cur_idx = queue.pop(0) # 从队列开头弹出一个节点\n",
" \n",
" neigh_indexes = get_neigh(eps, df, cur_idx) # index labels of its neighbours\n",
" \n",
" if len(neigh_indexes) < min_pnts: # 如果邻居数目少于阈值,它不是核心节点,无法影响邻居\n",
" continue\n",
" \n",
" for neigh_idx in neigh_indexes: # 否则对于所有邻居进行处理\n",
" \n",
" if labels[neigh_idx] == -1: # 如果邻居是异常,修正它,因为它是异常,不可能再影响别的点\n",
" labels[neigh_idx] = C\n",
" continue\n",
" \n",
" if not visited[neigh_idx]: # 对于邻居中未处理的点,把它纳入簇中\n",
" labels[neigh_idx] = C\n",
" visited[neigh_idx] = True\n",
" queue.append(neigh_idx)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def cluster_with_stack(eps, min_pnts, df):\n",
" \n",
" visited = [False for i in range(len(df))]\n",
" labels = [0 for i in range(len(df))]\n",
" \n",
" C = 0\n",
" \n",
" for idx, row in df.iterrows():\n",
" if visited[idx]:\n",
" continue\n",
" \n",
" neigh_indexes = get_neigh(eps, df, idx)\n",
" if len(neigh_indexes) < min_pnts: # 标记为异常\n",
" labels[idx] = -1\n",
" visited[idx] = True\n",
" else: \n",
" C += 1\n",
" grow_cluster(df, visited, labels, idx, C, eps, min_pnts)\n",
" \n",
" \n",
" return list(zip(df.index,labels))\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.legend.Legend at 0x254dd697f98>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x504 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"eps = 0.6\n",
"min_pnts = 3\n",
"\n",
"data = pd.DataFrame(X, columns=['x', 'y'])\n",
"\n",
"cluster_result = cluster_with_stack(eps, min_pnts, data)\n",
"idx, cluster = list(zip(*cluster_result))\n",
"cluster_df = pd.DataFrame(cluster_result, columns=['idx', 'cluster'])\n",
"\n",
"\n",
"plt.figure(figsize=(10,7))\n",
"for clust in np.unique(cluster):\n",
" cluster_idx = cluster_df[cluster_df['cluster'] == clust]['idx'].values\n",
" plt.scatter(X[cluster_idx, 0],\n",
" X[cluster_idx, 1],\n",
" s=10,\n",
" label=f'cluster{clust}'\n",
" )\n",
" \n",
"plt.legend([f'cluster{clust}' for clust in np.unique(cluster)], loc='lower right')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment