Skip to content

Instantly share code, notes, and snippets.

@cloverrose
Created September 30, 2013 06:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cloverrose/6760073 to your computer and use it in GitHub Desktop.
Save cloverrose/6760073 to your computer and use it in GitHub Desktop.
k-means法の初期クラスタリングを改良したk-means++法(http://ja.wikipedia.org/wiki/K-means%2B%2B%E6%B3%95http://rosettacode.org/wiki/K-means%2B%2B_clustering#Python を参考にした
# -*- coding:utf-8 -*-
from __future__ import division
import random
import operator
import itertools
import sys
def calc_distance(point1, point2):
return sum([(x - y) ** 2 for x, y in zip(point1, point2)]) ** 0.5
def calc_centroid(index, points, assigns):
match = [p for p, assign in zip(points, assigns) if assign == index]
n = len(match)
if n == 0:
return [0] * len(points[0])
else:
return [sum(x) / n for x in zip(*match)]
def calc_distance_between_nearest_centroid(point, centroids):
distances = [calc_distance(point, centroid) for centroid in centroids]
nearest_centroid, nearest_distance = min(enumerate(distances),
key=operator.itemgetter(1))
return nearest_centroid, nearest_distance
def kpp(points, k):
centroids = []
centroids.append(random.choice(points)[:])
for i in xrange(1, k):
distances = [calc_distance_between_nearest_centroid(p, centroids)[1]
for p in points]
sum_distance = sum(distances) * random.random()
for j, distance in enumerate(distances):
sum_distance -= distance
if sum_distance <= 0:
centroids.append(points[j][:])
break
assigns = [calc_distance_between_nearest_centroid(p, centroids)[0]
for p in points]
return centroids, assigns
def start(points, k):
centroids, assigns = kpp(points, k)
for count in itertools.count():
prev_assigns = assigns[:]
centroids = [calc_centroid(i, points, assigns) for i in xrange(k)]
assigns = [calc_distance_between_nearest_centroid(p, centroids)[0]
for p in points]
if assigns == prev_assigns:
sys.stderr.write('num of iterations: {0}\n'.format(count))
break
return assigns
def make_sample(dimension, num):
points = [[random.random() for _ in xrange(dimension)]
for _ in xrange(num)]
for i, p in enumerate(points):
if i < num / 2:
p[0] += 0.5
else:
p[0] -= 0.5
return points
def test():
points = make_sample(100, 100)
assigns = start(points, 2)
print assigns
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment