Last active
August 29, 2015 14:05
-
-
Save Kensuke-Mitsuzawa/ff2391055ac993090a10 to your computer and use it in GitHub Desktop.
cluster analysis with PCA using scikit-learn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python | |
# -*- coding: utf-8 -*-
import numpy as np | |
from sklearn.decomposition import PCA | |
import pandas | |
import logging | |
import os, sys, codecs, json | |
from sklearn.cluster import KMeans | |
from sklearn import datasets | |
# Output format shared by every record: timestamp - level - message.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Stream handler (StreamHandler() with no argument writes to sys.stderr)
# that lets DEBUG and above through and uses the format defined above.
sh = logging.StreamHandler()
sh.setFormatter(formatter)
sh.setLevel(logging.DEBUG)

# Fetch the root logger, open it up to DEBUG, and attach the handler.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(sh)
def CallPCA(dataset, dimention=3):
    """
    Project ``dataset`` onto its principal components with PCA.

    ``dimention`` is the number of dimensions kept after the projection
    (passed to ``PCA`` as ``n_components``).

    INPUT: np.array with samples in rows and features in columns
           (``dataset.shape[1]`` is reported as the input dimensionality)
    RETURN: numpy.ndarray, the transformed data
    """
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print('Ipnput data dims:{}'.format(dataset.shape[1]))
    pca = PCA(n_components=dimention)
    pca.fit(dataset)
    dataset_low_dm = pca.transform(dataset)
    print('Output dims:{}'.format(dataset_low_dm.shape[1]))
    return dataset_low_dm
def CallKmeans(input_array, NUM_CLUSTERS=100):
    """
    Cluster ``input_array`` with k-means and return the assigned labels.

    INPUT: numpy.ndarray input_array
           NUM_CLUSTERS: number of clusters to form
    OUTPUT: the fitted estimator's ``labels_`` (one cluster number per
            element; a numpy.ndarray)
    """
    estimator = KMeans(
        n_clusters=NUM_CLUSTERS,
        init='k-means++',
        n_init=1,
        verbose=True,
    )
    # Fit on the real data; afterwards labels_ holds each element's cluster.
    # (estimator.transform / estimator.predict are also available if the
    # cluster-space coordinates or nearest-cluster predictions are needed.)
    estimator.fit(input_array)
    return estimator.labels_
def KmeansTest():
    """
    Smoke test for KMeans on the iris data set.

    Fits a 5-cluster model, exercises ``transform`` and prints the result
    of ``predict`` on the training data.  For details of the API see
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    """
    from sklearn.cluster import KMeans
    from sklearn import datasets
    NUM_CLUSTERS = 5
    # Seed numpy's global RNG before building the estimator.
    np.random.seed(5)
    X = datasets.load_iris().data
    km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, verbose=True)
    km.fit(X)  # fit on the real data
    km.transform(X)  # project into cluster-distance space (result unused)
    # Nearest cluster for every sample; print(...) with one argument works
    # on both Python 2 and Python 3.
    print(km.predict(X))
def Test01():
    """
    Smoke test for CallPCA on a tiny hand-written data set.

    The data has only two features, so the projection is requested with
    ``dimention=2``: PCA cannot produce more components than the data has
    features, and CallPCA's default of 3 would be rejected.
    """
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    projected = CallPCA(X, dimention=2)
    # CallPCA documents a numpy.ndarray return value; check it instead of
    # computing type(...) and throwing the answer away.
    assert isinstance(projected, np.ndarray)
def Test02():
    """
    Run CallPCA on the iris data set, reducing it to two dimensions.

    Prints the original feature matrix followed by the projected one.
    """
    from sklearn import datasets
    iris = datasets.load_iris()
    X = iris.data
    # print(...) with a single argument behaves identically on Python 2
    # and Python 3.
    print(X)
    y = CallPCA(X, dimention=2)
    print(y)
def Normalize(input_array):
    """
    Normalize the input, then standardize it.

    Applies ``sklearn.preprocessing.normalize`` followed by
    ``sklearn.preprocessing.scale``, both with their default arguments,
    and returns the result.

    References:
    http://scikit-learn.org/stable/modules/preprocessing.html
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.scale.html#sklearn.preprocessing.scale
    """
    # ``import sklearn`` alone does not guarantee that the ``preprocessing``
    # submodule has been loaded, so import the submodule explicitly.
    from sklearn import preprocessing
    input_array = preprocessing.normalize(input_array)
    input_array = preprocessing.scale(input_array)
    return input_array
def ReadFromCSV(input_path):
    """
    Read a CSV file and return its contents as a float64 numpy array.

    The ``follower`` column is dropped and the remaining table is
    transposed, so each remaining CSV column becomes one row of the result.

    INPUT: input_path, anything ``pandas.read_csv`` accepts
    RETURN: numpy.ndarray with dtype float64
    RAISES: KeyError if the CSV has no ``follower`` column; ``astype``
            raises if a remaining value cannot be converted to float
    """
    logger.debug('start Loading DF')
    input_df = pandas.read_csv(input_path)
    logger.debug('Finished Loading DF')
    del input_df['follower']  # drop the identifier column
    # Converting as-is would leave samples and dimensions swapped, so
    # transpose here.
    input_df = input_df.T
    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # .values is available in both old and current releases.
    numpyMatrix = input_df.values
    numpyMatrix = numpyMatrix.astype(np.float64)  # convert to 64-bit floats
    logger.debug('Finished converting to numpay array')
    return numpyMatrix
# ---------------------------------------------------------------------------
# Script body: load CSV -> normalize -> PCA -> k-means -> write CSV.
# NOTE(review): indentation was lost in the source this was recovered from;
# the loop body is assumed to end just before the 'end Kmeans' log line.
# ---------------------------------------------------------------------------
input_path = './test_input.csv'
input_array = ReadFromCSV(input_path)
input_array = Normalize(input_array)

# PCA is asked for as many components as the input has columns.
logger.debug('start PCA')
low_dim_array = CallPCA(input_array, dimention=input_array.shape[1])
logger.debug('end PCA')

logger.debug('start Kmeans')
kmens_result_map = {}
cluster_num_list = [1, 3, 5]  # cluster counts to run k-means with
# The things to be clustered are the "follower" entries, so transpose.
low_dim_array = low_dim_array.T
for iter_num, cluster_num in enumerate(cluster_num_list):
    logger.debug('iter: {}'.format(iter_num))
    labels_array = CallKmeans(low_dim_array, NUM_CLUSTERS=cluster_num)
    key_name = 'k_{}'.format(cluster_num)
    kmens_result_map[key_name] = labels_array.tolist()
logger.debug('end Kmeans')

# One label column per cluster count, placed next to the clustered values.
label_df = pandas.DataFrame(kmens_result_map)
value_df = pandas.DataFrame(low_dim_array)
output_df = pandas.concat([label_df, value_df], axis=1)
output_df.to_csv(
    path_or_buf='./cluster_result.csv',
    sep=',',
    header=True,
    index=True,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
follower,aaaa,bbbb,cccc,dddd,eeeee,fffff | |
m01,0,0,0,0,1,1 | |
m02,1,1,1,1,0,0 | |
m03,1,0,1,0,1,1 | |
m04,0,0,0,0,0,1 | |
m05,1,1,0,1,0,1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment