Skip to content

Instantly share code, notes, and snippets.

@mverzilli
Created Mar 7, 2016
Embed
What would you like to do?
Crystal implementation of Kmeans for a poll @ Manas,
require "csv"
MINIMUM_DISTANCE = 0.85
SIMILARITY_LOW_BOUND = 0.30
SIMILARITY_HIGH_BOUND = 0.60
BIT_DICTIONARY = ["1a", "1b", "1c", "1d", "2a", "2b", "2c", "2d", "3a", "3b", "4a", "4b", "4c", "5a", "5b", "5c", "5d", "6a", "6b", "6c", "7a", "7b", "7c", "8a", "8b", "8c", "9a", "9b", "9c", "9d", "9e", "9f", "9g", "10a", "10b", "10c", "10d", "11a", "11b", "12a", "12b", "12c", "13a", "13b", "13c", "13d"]
RANDOM = Random.new 23
struct KMeansRun
property responses
property avg_vector
property avg_distance
def initialize(@responses, @avg_vector, @avg_distance)
end
end
struct Response
property vector
property author
def initialize(@vector, @author)
end
end
def read_poll
CSV.parse (File.open "manas.csv")
end
def avg(vectors)
columns = vectors.transpose
columns.map do |c|
(c.inject(0.0) do |mem, var|
mem += var.to_f
mem
end) / c.size
end
end
def dot(v1, v2)
res = 0
(0...v1.size).each do |i|
res += v1[i] * v2[i]
end
res
end
def norm(v)
Math.sqrt(v.map { |x| x * x }.sum)
end
def distance(v1, v2)
k = dot(v1, v2) / (norm(v1) * norm(v2))
end
def random_bisect(responses)
randomized = responses.shuffle(RANDOM)
return randomized.take(randomized.size / 2), randomized.skip(randomized.size - randomized.size / 2)
end
def kmeans(responses)
pure_answers = responses.map &.vector
avg_vector = avg pure_answers
distances = pure_answers.map { |v| distance(avg_vector, v) }
avg_distance = distances.sum / distances.size
KMeansRun.new responses, avg_vector, avg_distance
end
def empty_groups(groups)
groups.map { |g| KMeansRun.new ([] of Response), g.avg_vector, g.avg_distance }
end
def add_to_closest_group(response, groups)
groups.max_by { |g| distance(response.vector, g.avg_vector) }.responses.push(response)
end
def compute_groups(groups, all_responses)
if groups.all? { |g| g.avg_distance >= MINIMUM_DISTANCE }
groups
else
new_groups = groups.map do |g|
if g.avg_distance < MINIMUM_DISTANCE
responses1, responses2 = random_bisect(g.responses)
[kmeans(responses1), kmeans(responses2)]
else
g
end
end.flatten
groups_to_fill = empty_groups(new_groups)
all_responses.each { |r| add_to_closest_group(r, groups_to_fill) }
new_computed_groups = groups_to_fill.map { |g| kmeans g.responses }
compute_groups(new_computed_groups, all_responses)
end
end
def sort_by_distance_to_avg!(coll)
coll.sort! do |b1, b2|
if (g.avg_vector[b1] - 0.5).abs > (g.avg_vector[b2] - 0.5).abs
-1
elsif (g.avg_vector[b1] - 0.5).abs == (g.avg_vector[b2] - 0.5).abs
0
else
1
end
end
end
poll = read_poll
responses = poll.map { |row| Response.new(row[0..-2].map(&.to_i), row[-1]) }
computed_groups = compute_groups [kmeans responses], responses
p "Finishing..."
pp computed_groups
pp computed_groups.size
computed_groups.each_with_index do |g, i|
p "**********************************************************************"
p "Analyzing class #{i}"
p "Average vector is: #{g.avg_vector}"
members = g.responses.map &.author
p "Members are: #{members}"
similar = [] of Int32
different = [] of Int32
g.avg_vector.each_with_index do |b, i|
if b < SIMILARITY_LOW_BOUND || b > SIMILARITY_HIGH_BOUND
similar.push i
else
different.push i
end
end
sort_by_distance_to_avg!(similar)
sort_by_distance_to_avg!(different)
similar_traits = similar.map { |b| "#{BIT_DICTIONARY[b]}(#{g.avg_vector[b]}), Abs: #{(g.avg_vector[b] - 0.5).abs}" }
different_traits = different.map { |b| "#{BIT_DICTIONARY[b]}(#{g.avg_vector[b]}), Abs: #{(g.avg_vector[b] - 0.5).abs}" }
p "Similar traits are: #{similar_traits}"
p "Different traits are: #{different_traits}"
p "**********************************************************************"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment