Skip to content

Instantly share code, notes, and snippets.

@plotti
Created June 11, 2012 16:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save plotti/2910991 to your computer and use it in GitHub Desktop.
Save plotti/2910991 to your computer and use it in GitHub Desktop.
Smart interest groups matching
# Number of list places (top entries of each member list) that are considered.
MAX = 200
# Threshold up to which categories should be merged, expressed as the fraction
# of shared members (e.g. 0.2 = 20 % of members are shared).
THRESHOLD = 0.2

# Both output files carry the two parameters in their file names.
partitions_path = "data/partitions#{MAX}_#{THRESHOLD}.csv"
final_partitions_path = "data/final_partitions#{MAX}_#{THRESHOLD}.csv"
outfile = CSV.open(partitions_path, "wb")
final_partition = CSV.open(final_partitions_path, "wb")

# Header row of the partitions report.
outfile << [
  "Name",
  "Original Category",
  "Original Category Place",
  "Assigned Category",
  "Assigned Category Place",
  "Competing Categories",
  "Details"
]
# Per-project member lookup:
#   members[project_name][member_name] = {:rank => list place, :count => Integer}
members = {}
@@communities.each do |community|
  project = Project.find(community)
  puts "Reading in project #{project.name}"
  # Row 0 is the header, so keep rows 1..MAX. Slicing a completely empty file
  # returns nil, hence the fallback to an empty list.
  rows = FasterCSV.read("#{RAILS_ROOT}/data/#{project.name}_sorted_members.csv")[1..MAX] || []
  ranked = {}
  rows.each_with_index do |member, index|
    # Blacklisted members are dropped but still consume a list place, so the
    # remaining ranks keep matching the row positions in the file.
    next if BLACKLIST.include?(member[0])
    # Column 0 is used as the member key, column 2 as the count.
    ranked[member[0]] = {:rank => index + 1, :count => member[2].to_i}
  end
  members[project.name] = ranked
end
# Maps the name of every absorbed group to the name of the merged group that
# now holds its members.
merged = {}
# First step: unite partitions that have a high overlap of members.
@@communities.each do |community|
  project = Project.find(community)
  # A project that was already absorbed no longer has an entry in `members`.
  next if merged[project.name]

  own_members = members[project.name]
  own_names = own_members.keys

  # Find the other group sharing the most members (the first one wins on ties).
  max_overlap_count, max_group = 0, ""
  members.each do |name, group|
    next if name == project.name # don't compare with yourself
    overlap_count = (group.keys & own_names).count
    if overlap_count > max_overlap_count
      max_overlap_count, max_group = overlap_count, name
    end
  end
  next unless max_overlap_count > MAX * THRESHOLD

  puts "Merged #{project.name} with #{max_group}"
  merged_name = "#{project.name}_#{max_group}"
  partner_members = members[max_group]

  # Add the counts and merge the members; a member that is missing from one
  # of the two groups contributes 0 from that side.
  combined = {}
  (own_names + partner_members.keys).uniq.each do |member|
    own_entry = own_members[member]
    partner_entry = partner_members[member]
    own_count = own_entry ? own_entry[:count] : 0
    partner_count = partner_entry ? partner_entry[:count] : 0
    combined[member] = {:rank => 0, :count => own_count + partner_count}
  end

  # Sort once by descending count and store each member's rank for faster
  # lookup (each pair is [member_name, entry]).
  ranked = combined.sort { |a, b| b[1][:count] <=> a[1][:count] }
  ranked.each_with_index { |pair, index| pair[1][:rank] = index + 1 }

  # Take only the first MAX members since the categories will grow.
  members[merged_name] = Hash[ranked.first(MAX)]

  # Point both absorbed groups, and everything that was previously merged
  # into them, at the new merged group.
  [project.name, max_group].each do |entry|
    members.delete(entry)
    merged[entry] = merged_name
    merged.each do |key, value|
      merged[key] = merged_name if value == entry
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment