Skip to content

Instantly share code, notes, and snippets.

@yasaichi
Last active August 31, 2022 12:18
Show Gist options
  • Star 37 You must be signed in to star a gist
  • Fork 11 You must be signed in to fork a gist
  • Save yasaichi/254a060eff56a3b3b858 to your computer and use it in GitHub Desktop.
Save yasaichi/254a060eff56a3b3b858 to your computer and use it in GitHub Desktop.
Implementation of X-means clustering in Python
"""
以下の論文で提案された改良x-means法の実装
クラスター数を自動決定するk-meansアルゴリズムの拡張について
http://www.rd.dnc.ac.jp/~tunenori/doc/xmeans_euc.pdf
"""
import numpy as np
from scipy import stats
from sklearn.cluster import KMeans
class XMeans:
"""
x-means法を行うクラス
"""
def __init__(self, k_init = 2, **k_means_args):
"""
k_init : The initial number of clusters applied to KMeans()
"""
self.k_init = k_init
self.k_means_args = k_means_args
def fit(self, X):
"""
x-means法を使ってデータXをクラスタリングする
X : array-like or sparse matrix, shape=(n_samples, n_features)
"""
self.__clusters = []
clusters = self.Cluster.build(X, KMeans(self.k_init, **self.k_means_args).fit(X))
self.__recursively_split(clusters)
self.labels_ = np.empty(X.shape[0], dtype = np.intp)
for i, c in enumerate(self.__clusters):
self.labels_[c.index] = i
self.cluster_centers_ = np.array([c.center for c in self.__clusters])
self.cluster_log_likelihoods_ = np.array([c.log_likelihood() for c in self.__clusters])
self.cluster_sizes_ = np.array([c.size for c in self.__clusters])
return self
def __recursively_split(self, clusters):
"""
引数のclustersを再帰的に分割する
clusters : list-like object, which contains instances of 'XMeans.Cluster'
"""
for cluster in clusters:
if cluster.size <= 3:
self.__clusters.append(cluster)
continue
k_means = KMeans(2, **self.k_means_args).fit(cluster.data)
c1, c2 = self.Cluster.build(cluster.data, k_means, cluster.index)
beta = np.linalg.norm(c1.center - c2.center) / np.sqrt(np.linalg.det(c1.cov) + np.linalg.det(c2.cov))
alpha = 0.5 / stats.norm.cdf(beta)
bic = -2 * (cluster.size * np.log(alpha) + c1.log_likelihood() + c2.log_likelihood()) + 2 * cluster.df * np.log(cluster.size)
if bic < cluster.bic():
self.__recursively_split([c1, c2])
else:
self.__clusters.append(cluster)
class Cluster:
"""
k-means法によって生成されたクラスタに関する情報を持ち、尤度やBICの計算を行うクラス
"""
@classmethod
def build(cls, X, k_means, index = None):
if index == None:
index = np.array(range(0, X.shape[0]))
labels = range(0, k_means.get_params()["n_clusters"])
return tuple(cls(X, index, k_means, label) for label in labels)
# index: Xの各行におけるサンプルが元データの何行目のものかを示すベクトル
def __init__(self, X, index, k_means, label):
self.data = X[k_means.labels_ == label]
self.index = index[k_means.labels_ == label]
self.size = self.data.shape[0]
self.df = self.data.shape[1] * (self.data.shape[1] + 3) / 2
self.center = k_means.cluster_centers_[label]
self.cov = np.cov(self.data.T)
def log_likelihood(self):
return sum(stats.multivariate_normal.logpdf(x, self.center, self.cov) for x in self.data)
def bic(self):
return -2 * self.log_likelihood() + self.df * np.log(self.size)
if __name__ == "__main__":
import matplotlib.pyplot as plt
# データの準備
x = np.array([np.random.normal(loc, 0.1, 20) for loc in np.repeat([1,2], 2)]).flatten()
y = np.array([np.random.normal(loc, 0.1, 20) for loc in np.tile([1,2], 2)]).flatten()
# クラスタリングの実行
x_means = XMeans(random_state = 1).fit(np.c_[x,y])
print(x_means.labels_)
print(x_means.cluster_centers_)
print(x_means.cluster_log_likelihoods_)
print(x_means.cluster_sizes_)
# 結果をプロット
plt.rcParams["font.family"] = "Hiragino Kaku Gothic Pro"
plt.scatter(x, y, c = x_means.labels_, s = 30)
plt.scatter(x_means.cluster_centers_[:,0], x_means.cluster_centers_[:,1], c = "r", marker = "+", s = 100)
plt.xlim(0, 3)
plt.ylim(0, 3)
plt.title("改良x-means法の実行結果 参考: 石岡(2000)")
plt.show()
# plt.savefig("clustering.png", dpi = 200)
@mhady7
Copy link

mhady7 commented Jun 24, 2015

Hi,

I get memory errors with the following array which is a little big , Can you please help ?

/home/mayed/Python/Drugs_Algo.py:116: FutureWarning: comparison to None will result in an elementwise object comparison in the future.
if index == None:
MemoryError: MemoryError()

/home/mayed/Python/Drugs_Algo.py(137)()
136 ####Fit and Predict the clusters
1-> 137 x_means = XMeans(random_state = 1,precompute_distances= False).fit(x_train)
138

ipdb> arr1.shape
(315, 13851)

Thanks
Mohamed

@hkubo1983
Copy link

私の環境(Numpy 1.13.0)ですと73行目,if index == None:でValueErrorが発生してしまいました.
if index is None:
に置き換えたところ動作致しましたので,念のためご報告させて頂きます.

@NafeesAhmedAbbasi
Copy link

I want to detect number of colors in a image file. I've used K-mean but I want to set number of cluster automatically. Now I want to use x-mean for automatically detecting clusters. can you help me out how can i do that.

@ysaito8015
Copy link

ysaito8015 commented Mar 13, 2018

57行目

np.linalg.det(c1.cov) と np.linalg.det(c2.cov) がともにゼロの場合, ZeroDivisionError が起きます

下記のように対応しました。

if np.linalg.det(c1.cov) == 0 and np.linalg.det(c2.cov) == 0:
      beta = 0
else:
      beta = np.linalg.norm(c1.center - c2.center) / np.sqrt(np.linalg.det(c1.cov) + np.linalg.det(c2.cov))

@cidrugHug8
Copy link

could you modify your code?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment