Skip to content

Instantly share code, notes, and snippets.

@Kensuke-Mitsuzawa
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Kensuke-Mitsuzawa/ff2391055ac993090a10 to your computer and use it in GitHub Desktop.
Save Kensuke-Mitsuzawa/ff2391055ac993090a10 to your computer and use it in GitHub Desktop.
cluster analysis with PCA using scikit-learn
#! /usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.decomposition import PCA
import pandas
import logging
import os, sys, codecs, json
from sklearn.cluster import KMeans
from sklearn import datasets
# Output format for every record: timestamp - severity - message text.
formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s')

# Stream handler (writes to sys.stderr by default) that lets DEBUG and above through.
sh = logging.StreamHandler()
sh.setFormatter(formatter)
sh.setLevel(logging.DEBUG)

# Fetch the root logger, register the handler on it and open it up to DEBUG.
logger = logging.getLogger()
logger.addHandler(sh)
logger.setLevel(logging.DEBUG)
def CallPCA(dataset, dimention=3):
    """
    Project ``dataset`` onto its principal components with PCA.

    ``dimention`` is the requested number of dimensions after projection.
    An integer request larger than min(n_samples, n_features) is capped at
    that value, because scikit-learn's PCA raises ValueError for such an
    ``n_components``.

    INPUT: numpy.ndarray, rows are samples and columns are features
    RETURN: numpy.ndarray holding the projected samples
    """
    print('Ipnput data dims:{}'.format(dataset.shape[1]))

    n_components = dimention
    max_components = min(dataset.shape)
    # Only integer requests are capped; any other kind of value is handed
    # to PCA untouched so that it keeps scikit-learn's own semantics.
    if isinstance(dimention, (int, np.integer)) and dimention > max_components:
        n_components = max_components

    pca = PCA(n_components=n_components)
    pca.fit(dataset)
    dataset_low_dm = pca.transform(dataset)
    print('Output dims:{}'.format(dataset_low_dm.shape[1]))
    return dataset_low_dm
def CallKmeans(input_array, NUM_CLUSTERS=100):
    """
    Run k-means on ``input_array`` and return the cluster label of each row.

    INPUT: numpy.ndarray input_array, NUM_CLUSTERS is the number of clusters
    OUTPUT: numpy.ndarray of cluster indices (the estimator's ``labels_``)
    """
    model = KMeans(
        n_clusters=NUM_CLUSTERS,
        init='k-means++',
        n_init=1,
        verbose=True,
    )
    # fit() returns the estimator itself, so the labels can be read off directly.
    return model.fit(input_array).labels_
def KmeansTest():
    """
    Smoke test for KMeans: cluster the iris data and print the predictions.

    After fitting, ``km.labels_`` holds the label of every training sample
    and ``km.transform(X)`` maps samples into cluster-distance space; only
    ``km.predict(X)`` is printed here.
    For details of the API see
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    """
    NUM_CLUSTERS = 5
    # KMeans is created without random_state, so it draws from NumPy's global
    # random state; seeding it makes the run repeatable.
    np.random.seed(5)
    X = datasets.load_iris().data
    km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, verbose=True)
    km.fit(X)
    # Nearest cluster for every sample.
    print(km.predict(X))
def Test01():
    """
    Run CallPCA on a tiny hand-written array and print the type of the result.
    """
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    # The data has only two features, so request two components explicitly
    # instead of relying on CallPCA's default of three.
    projected = CallPCA(X, dimention=2)
    print(type(projected))
def Test02():
    """
    Project the iris measurements to two dimensions with CallPCA.

    Prints the raw data first and the projected data afterwards.
    """
    X = datasets.load_iris().data
    print(X)
    y = CallPCA(X, dimention=2)
    print(y)
def Normalize(input_array):
    """
    Normalize and then standardize ``input_array``.

    The array is first passed through ``preprocessing.normalize`` and the
    result through ``preprocessing.scale``, both with their default options.
    http://scikit-learn.org/stable/modules/preprocessing.html
    The scale step is documented here:
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.scale.html#sklearn.preprocessing.scale

    INPUT: numpy.ndarray
    RETURN: numpy.ndarray, the normalized and scaled data
    """
    # Import the subpackage explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.preprocessing`` has been loaded.
    from sklearn import preprocessing

    normalized = preprocessing.normalize(input_array)
    return preprocessing.scale(normalized)
def ReadFromCSV(input_path):
    """
    Read a CSV file and return its contents as a float64 numpy array.

    The ``follower`` column is dropped and the remaining table is transposed
    before conversion, so the rows of the result correspond to the CSV's
    remaining columns.

    INPUT: path of the CSV file; it must contain a ``follower`` column
    RETURN: numpy.ndarray of dtype float64
    Raises KeyError when the ``follower`` column is missing.
    """
    logger.debug('start Loading DF')
    input_df = pandas.read_csv(input_path)
    logger.debug('Finished Loading DF')
    del input_df['follower']  # drop the label column
    # Converting as-is would leave samples and dimensions swapped, so transpose first.
    input_df = input_df.T
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
    # the same ndarray on both old and current pandas.
    numpyMatrix = input_df.values.astype(np.float64)
    logger.debug('Finished converting to numpay array')
    return numpyMatrix
# Load the input table and normalize it.
input_path = './test_input.csv'
input_array = Normalize(ReadFromCSV(input_path))

# PCA, keeping as many components as the input has columns.
logger.debug('start PCA')
low_dim_array = CallPCA(input_array, dimention=input_array.shape[1])
logger.debug('end PCA')

logger.debug('start Kmeans')
cluster_num_list = [1, 3, 5]  # cluster counts to run k-means with
kmens_result_map = {}
# The "follower" entries are what should be clustered, so transpose first.
low_dim_array = low_dim_array.T
for iter_num, cluster_num in enumerate(cluster_num_list):
    logger.debug('iter: {}'.format(iter_num))
    labels_array = CallKmeans(low_dim_array, NUM_CLUSTERS=cluster_num)
    # One column of labels per cluster count, keyed as k_<count>.
    kmens_result_map['k_{}'.format(cluster_num)] = labels_array.tolist()
logger.debug('end Kmeans')

# Put the label columns next to the clustered values and write them out as CSV.
label_df = pandas.DataFrame(kmens_result_map)
value_df = pandas.DataFrame(low_dim_array)
output_df = pandas.concat([label_df, value_df], axis=1)
output_df.to_csv(
    path_or_buf='./cluster_result.csv',
    sep=',',
    header=True,
    index=True,
)
follower,aaaa,bbbb,cccc,dddd,eeeee,fffff
m01,0,0,0,0,1,1
m02,1,1,1,1,0,0
m03,1,0,1,0,1,1
m04,0,0,0,0,0,1
m05,1,1,0,1,0,1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment