Last active
August 29, 2015 14:12
-
-
Save ymurase/96f3b28c2618a10cf144 to your computer and use it in GitHub Desktop.
K-means clustering in 2D periodic boundary condition. The length of the system is 1. Calculation of AIC contains a bug (TODO)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'pp' | |
EPSILON = 0.0001 | |
def load_points(io) | |
points = io.map do |line| | |
line.chomp.split.map(&:to_f) | |
end | |
points | |
end | |
def kmeans_aic(kmin, kmax, points, num_iterations) | |
min_aic = Float::MAX | |
best_clusters = [] | |
(kmin..kmax).each do |k| | |
$stderr.puts "K: #{k}" | |
cost, clusters = kmeans_pbc_iteration(k, points, num_iterations) | |
aic = 2*k - 2*Math.log(cost) | |
aicc = aic + ( 2*k*(k+1) / (points.size - k - 1) ) | |
pp "cost: #{cost}, aic: #{aic}, aicc: #{aicc}" | |
if aicc < min_aic | |
min_aic = aicc | |
best_clusters = clusters | |
end | |
end | |
[min_aic, best_clusters] | |
end | |
def kmeans_pbc_iteration(k, points, num_iterations) | |
min_cost = Float::MAX | |
best_clusters = [] | |
num_iterations.times do |t| | |
$stderr.puts " iteration: #{t}" | |
cost, clusters = k_means_pbc(k, points) | |
if cost < min_cost | |
min_cost = cost | |
best_clusters = clusters | |
end | |
end | |
[min_cost, best_clusters] | |
end | |
def k_means_pbc(k, points) | |
centers = Array.new(k) {|idx| [rand, rand] } | |
prev_cost = Float::MAX | |
cost = 0.0 | |
loop do | |
centers, cost = update_centers(points, centers) | |
# $stderr.puts " cost: #{cost}" | |
break if prev_cost - cost < EPSILON | |
prev_cost = cost | |
end | |
clusters = get_cluster_indexes(points, centers) | |
[cost, clusters] | |
end | |
def calculate_cm_in_pbc(points) | |
calculate_cm = ->(xs) { | |
thetas = xs.map {|x| 2*Math::PI*x } | |
ave_xi = thetas.map {|theta| Math.cos(theta) }.inject(:+) / thetas.size | |
ave_zeta = thetas.map {|theta| Math.sin(theta) }.inject(:+) / thetas.size | |
theta_bar = Math.atan2(-ave_zeta, -ave_xi) + Math::PI | |
theta_bar / (2*Math::PI) | |
} | |
cm_x = calculate_cm.call( points.map {|point| point[0]} ) | |
cm_y = calculate_cm.call( points.map {|point| point[1]} ) | |
[cm_x, cm_y] | |
end | |
def update_centers(points, centers) | |
clusters = get_cluster_indexes(points, centers) | |
points_in_cluster = Hash.new {|hash,key| hash[key] = Array.new } | |
clusters.zip(points).each {|c,p| points_in_cluster[c].push(p) } | |
cms = Hash[ | |
points_in_cluster.map do |c,ps| | |
cm = ps.empty? ? points.choice : calculate_cm_in_pbc(ps) | |
[c, cm] | |
end | |
] | |
cost_total = points.zip(clusters) | |
.map {|p,c| distance(p, cms[c]) ** 2 } | |
.inject(:+) | |
[cms.values, cost_total] | |
end | |
def get_cluster_indexes(points, centers) | |
indexes = points.map do |point| | |
cluster_index = centers.each_with_index.min_by {|c,idx| distance(point,c) }[1] | |
end | |
indexes | |
end | |
def distance(p1,p2) | |
dx = (p1[0]-p2[0]).abs | |
dx = 1.0 - dx if dx > 0.5 | |
dy = (p1[1]-p2[1]).abs | |
dy = 1.0 - dy if dy > 0.5 | |
Math.sqrt(dx*dx+dy*dy) | |
end | |
unless ARGV.size == 4 | |
$stderr.puts "Usage: ruby #{__FILE__} <k_max> points.dat num_iterations seed" | |
raise "invalid arguments" | |
end | |
k_max = ARGV[0].to_i | |
points = load_points(File.open(ARGV[1]) ) | |
num_iterations = ARGV[2].to_i | |
srand(ARGV[3].to_i) | |
# cost,clusters = kmeans_pbc_iteration(k, points, num_iterations) | |
cost,clusters = kmeans_aic(1, k_max, points, num_iterations) | |
h = Hash.new {|h,k| h[k] = [] } | |
clusters.zip(points).each {|c,point| h[c].push point } | |
# pp cost, h |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment