Created
March 7, 2016 18:02
-
-
Save mverzilli/21092b3e639a64e44816 to your computer and use it in GitHub Desktop.
Crystal implementation of Kmeans for a poll @ Manas,
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "csv" | |
MINIMUM_DISTANCE = 0.85 | |
SIMILARITY_LOW_BOUND = 0.30 | |
SIMILARITY_HIGH_BOUND = 0.60 | |
BIT_DICTIONARY = ["1a", "1b", "1c", "1d", "2a", "2b", "2c", "2d", "3a", "3b", "4a", "4b", "4c", "5a", "5b", "5c", "5d", "6a", "6b", "6c", "7a", "7b", "7c", "8a", "8b", "8c", "9a", "9b", "9c", "9d", "9e", "9f", "9g", "10a", "10b", "10c", "10d", "11a", "11b", "12a", "12b", "12c", "13a", "13b", "13c", "13d"] | |
RANDOM = Random.new 23 | |
struct KMeansRun | |
property responses | |
property avg_vector | |
property avg_distance | |
def initialize(@responses, @avg_vector, @avg_distance) | |
end | |
end | |
struct Response | |
property vector | |
property author | |
def initialize(@vector, @author) | |
end | |
end | |
def read_poll | |
CSV.parse (File.open "manas.csv") | |
end | |
def avg(vectors) | |
columns = vectors.transpose | |
columns.map do |c| | |
(c.inject(0.0) do |mem, var| | |
mem += var.to_f | |
mem | |
end) / c.size | |
end | |
end | |
def dot(v1, v2) | |
res = 0 | |
(0...v1.size).each do |i| | |
res += v1[i] * v2[i] | |
end | |
res | |
end | |
def norm(v) | |
Math.sqrt(v.map { |x| x * x }.sum) | |
end | |
def distance(v1, v2) | |
k = dot(v1, v2) / (norm(v1) * norm(v2)) | |
end | |
def random_bisect(responses) | |
randomized = responses.shuffle(RANDOM) | |
return randomized.take(randomized.size / 2), randomized.skip(randomized.size - randomized.size / 2) | |
end | |
def kmeans(responses) | |
pure_answers = responses.map &.vector | |
avg_vector = avg pure_answers | |
distances = pure_answers.map { |v| distance(avg_vector, v) } | |
avg_distance = distances.sum / distances.size | |
KMeansRun.new responses, avg_vector, avg_distance | |
end | |
def empty_groups(groups) | |
groups.map { |g| KMeansRun.new ([] of Response), g.avg_vector, g.avg_distance } | |
end | |
def add_to_closest_group(response, groups) | |
groups.max_by { |g| distance(response.vector, g.avg_vector) }.responses.push(response) | |
end | |
def compute_groups(groups, all_responses) | |
if groups.all? { |g| g.avg_distance >= MINIMUM_DISTANCE } | |
groups | |
else | |
new_groups = groups.map do |g| | |
if g.avg_distance < MINIMUM_DISTANCE | |
responses1, responses2 = random_bisect(g.responses) | |
[kmeans(responses1), kmeans(responses2)] | |
else | |
g | |
end | |
end.flatten | |
groups_to_fill = empty_groups(new_groups) | |
all_responses.each { |r| add_to_closest_group(r, groups_to_fill) } | |
new_computed_groups = groups_to_fill.map { |g| kmeans g.responses } | |
compute_groups(new_computed_groups, all_responses) | |
end | |
end | |
def sort_by_distance_to_avg!(coll) | |
coll.sort! do |b1, b2| | |
if (g.avg_vector[b1] - 0.5).abs > (g.avg_vector[b2] - 0.5).abs | |
-1 | |
elsif (g.avg_vector[b1] - 0.5).abs == (g.avg_vector[b2] - 0.5).abs | |
0 | |
else | |
1 | |
end | |
end | |
end | |
poll = read_poll | |
responses = poll.map { |row| Response.new(row[0..-2].map(&.to_i), row[-1]) } | |
computed_groups = compute_groups [kmeans responses], responses | |
p "Finishing..." | |
pp computed_groups | |
pp computed_groups.size | |
computed_groups.each_with_index do |g, i| | |
p "**********************************************************************" | |
p "Analyzing class #{i}" | |
p "Average vector is: #{g.avg_vector}" | |
members = g.responses.map &.author | |
p "Members are: #{members}" | |
similar = [] of Int32 | |
different = [] of Int32 | |
g.avg_vector.each_with_index do |b, i| | |
if b < SIMILARITY_LOW_BOUND || b > SIMILARITY_HIGH_BOUND | |
similar.push i | |
else | |
different.push i | |
end | |
end | |
sort_by_distance_to_avg!(similar) | |
sort_by_distance_to_avg!(different) | |
similar_traits = similar.map { |b| "#{BIT_DICTIONARY[b]}(#{g.avg_vector[b]}), Abs: #{(g.avg_vector[b] - 0.5).abs}" } | |
different_traits = different.map { |b| "#{BIT_DICTIONARY[b]}(#{g.avg_vector[b]}), Abs: #{(g.avg_vector[b] - 0.5).abs}" } | |
p "Similar traits are: #{similar_traits}" | |
p "Different traits are: #{different_traits}" | |
p "**********************************************************************" | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment