-
-
Save wflanagan/c5ca62129cdc6a1a61a0 to your computer and use it in GitHub Desktop.
Kmeans clusterer for hashes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'kmeans-clusterer' | |
# Move this to Oma::Api::Kmeans once it's working and we
# decide to use it.
module Maintenance
  # Clusters profile hashes with k-means over a binary bag-of-words
  # representation of each profile's text.
  #
  # Every profile must respond to `[]` with the string keys 'username',
  # 'name' and 'description' (see #profile_to_string); missing values are
  # treated as empty text.
  class Kmeans
    attr_reader :profiles

    # @param profiles [Array<Hash>] the profiles to cluster
    def initialize(profiles)
      @profiles = profiles
    end

    # Replaces the profile list and drops every memoised result so that the
    # vocabulary, vectors and clustering are recomputed for the new data.
    def profiles=(profiles)
      @profiles = profiles
      @words = nil
      @vector_index = nil
      @target_cluster_count = nil
      @kmeans = nil
    end

    # The vocabulary: every distinct word found in any profile, in order of
    # first appearance. Empty tokens are never included.
    # TODO: remove stop words
    #
    # @return [Array<String>]
    def words
      @words ||= profiles.flat_map { |profile| profile_tokens(profile) }.uniq
    end

    # One n-dimensional 0/1 vector per profile, where n is the vocabulary
    # size. A component is 1 when the profile contains that vocabulary word
    # as a whole token (not merely as a substring of a longer word).
    #
    # @return [Hash{Integer => Array<Integer>}] profile position => vector
    def vector_index
      @vector_index ||= begin
        # Look each word's column up once instead of scanning the profile
        # text again for every word in the vocabulary.
        columns = {}
        words.each_with_index { |word, column| columns[word] = column }

        index = {}
        profiles.each_with_index do |profile, position|
          vector = Array.new(words.length, 0)
          profile_tokens(profile).each do |token|
            column = columns[token]
            # A token can only be unknown if profiles was mutated in place
            # after the vocabulary was memoised; skip it rather than crash.
            vector[column] = 1 if column
          end
          index[position] = vector
        end
        index
      end
    end

    # Selects the attributes that get merged together for the purposes of
    # this calculation and lower-cases the result.
    #
    # @param profile [Hash] a single profile
    # @return [String]
    def profile_to_string(profile)
      "#{profile['username']} #{profile['name']} #{profile['description']}".downcase
    end

    # Number of clusters to ask for: one per three profiles, but never fewer
    # than one. Falls back to 3 when profiles has no length (e.g. nil).
    #
    # @return [Integer]
    def target_cluster_count
      @target_cluster_count ||=
        if profiles.respond_to?(:length)
          [profiles.length / 3, 1].max
        else
          3
        end
    end

    # Runs (and memoises) the k-means calculation, labelling each vector with
    # the position of its profile in #profiles.
    #
    # @return the result object of KMeansClusterer.run
    def kmeans
      @kmeans ||= KMeansClusterer.run(
        target_cluster_count,
        vector_index.values,
        labels: vector_index.keys,
        runs: 5
      )
    end

    # @return the clusters reported by the k-means result
    def clusters
      kmeans.clusters
    end

    private

    # Splits a profile's merged text into its word tokens.
    # `scan` is used instead of `split` so leading punctuation or whitespace
    # cannot produce an empty-string token.
    def profile_tokens(profile)
      profile_to_string(profile).scan(/\w+/)
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment