Skip to content

Instantly share code, notes, and snippets.

@ymurase
Last active August 29, 2015 14:12
Show Gist options
  • Save ymurase/96f3b28c2618a10cf144 to your computer and use it in GitHub Desktop.
Save ymurase/96f3b28c2618a10cf144 to your computer and use it in GitHub Desktop.
K-means clustering in 2D periodic boundary condition. The length of the system is 1. Calculation of AIC contains a bug (TODO)
require 'pp'
EPSILON = 0.0001
def load_points(io)
points = io.map do |line|
line.chomp.split.map(&:to_f)
end
points
end
def kmeans_aic(kmin, kmax, points, num_iterations)
min_aic = Float::MAX
best_clusters = []
(kmin..kmax).each do |k|
$stderr.puts "K: #{k}"
cost, clusters = kmeans_pbc_iteration(k, points, num_iterations)
aic = 2*k - 2*Math.log(cost)
aicc = aic + ( 2*k*(k+1) / (points.size - k - 1) )
pp "cost: #{cost}, aic: #{aic}, aicc: #{aicc}"
if aicc < min_aic
min_aic = aicc
best_clusters = clusters
end
end
[min_aic, best_clusters]
end
def kmeans_pbc_iteration(k, points, num_iterations)
min_cost = Float::MAX
best_clusters = []
num_iterations.times do |t|
$stderr.puts " iteration: #{t}"
cost, clusters = k_means_pbc(k, points)
if cost < min_cost
min_cost = cost
best_clusters = clusters
end
end
[min_cost, best_clusters]
end
def k_means_pbc(k, points)
centers = Array.new(k) {|idx| [rand, rand] }
prev_cost = Float::MAX
cost = 0.0
loop do
centers, cost = update_centers(points, centers)
# $stderr.puts " cost: #{cost}"
break if prev_cost - cost < EPSILON
prev_cost = cost
end
clusters = get_cluster_indexes(points, centers)
[cost, clusters]
end
def calculate_cm_in_pbc(points)
calculate_cm = ->(xs) {
thetas = xs.map {|x| 2*Math::PI*x }
ave_xi = thetas.map {|theta| Math.cos(theta) }.inject(:+) / thetas.size
ave_zeta = thetas.map {|theta| Math.sin(theta) }.inject(:+) / thetas.size
theta_bar = Math.atan2(-ave_zeta, -ave_xi) + Math::PI
theta_bar / (2*Math::PI)
}
cm_x = calculate_cm.call( points.map {|point| point[0]} )
cm_y = calculate_cm.call( points.map {|point| point[1]} )
[cm_x, cm_y]
end
def update_centers(points, centers)
clusters = get_cluster_indexes(points, centers)
points_in_cluster = Hash.new {|hash,key| hash[key] = Array.new }
clusters.zip(points).each {|c,p| points_in_cluster[c].push(p) }
cms = Hash[
points_in_cluster.map do |c,ps|
cm = ps.empty? ? points.choice : calculate_cm_in_pbc(ps)
[c, cm]
end
]
cost_total = points.zip(clusters)
.map {|p,c| distance(p, cms[c]) ** 2 }
.inject(:+)
[cms.values, cost_total]
end
def get_cluster_indexes(points, centers)
indexes = points.map do |point|
cluster_index = centers.each_with_index.min_by {|c,idx| distance(point,c) }[1]
end
indexes
end
def distance(p1,p2)
dx = (p1[0]-p2[0]).abs
dx = 1.0 - dx if dx > 0.5
dy = (p1[1]-p2[1]).abs
dy = 1.0 - dy if dy > 0.5
Math.sqrt(dx*dx+dy*dy)
end
unless ARGV.size == 4
$stderr.puts "Usage: ruby #{__FILE__} <k_max> points.dat num_iterations seed"
raise "invalid arguments"
end
k_max = ARGV[0].to_i
points = load_points(File.open(ARGV[1]) )
num_iterations = ARGV[2].to_i
srand(ARGV[3].to_i)
# cost,clusters = kmeans_pbc_iteration(k, points, num_iterations)
cost,clusters = kmeans_aic(1, k_max, points, num_iterations)
h = Hash.new {|h,k| h[k] = [] }
clusters.zip(points).each {|c,point| h[c].push point }
# pp cost, h
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment