Created
April 23, 2011 02:08
-
-
Save tsaylor/938164 to your computer and use it in GitHub Desktop.
This is a tool for measuring the similarity between two ranked lists. The more items that sit at the same position, or at nearby positions, in both lists, the higher the score. I believe I used this to determine the best match for rankings of some items, but I have no idea now.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math
import random
# Tunable parameters read by get_match_pct.
# The first list position is worth position_weighting + 1 points; the weight
# then steps down, reaching 1 at position max_pos (for the values below).
position_weighting = 3.0
max_pos = 10
near_miss_scaling = 1.0  # bigger means higher scores

# Module-level totals. get_match_pct assigns its own locals with the same
# names, so these two are never updated by it.
total_points = 0
total_match_points = 0

# Debug bookkeeping: distinct near-miss scale factors recorded while scoring.
sf = []
def get_match_pct(list1, list2, weighting=None, span=None, scaling=None):
    """Return how closely the ordering of ``list1`` matches ``list2`` (0-100).

    Every position in ``list1`` carries a weight: the first position is worth
    ``weighting + 1`` points and the weight steps down as the position grows
    (positions beyond ``span`` can receive zero or negative weight).  Each
    item earns its full weight when it sits at the same index in ``list2``
    and a fraction ``1 / (distance / scaling + 1)`` of it otherwise.  The
    earned points are rounded to a whole number and expressed as a
    percentage of the points available.

    Args:
        list1: Reference ranking.
        list2: Ranking to compare; must contain every item of ``list1``.
        weighting: How much more the first position counts than the last.
            Defaults to the module-level ``position_weighting``.
        span: Position at which the weight bottoms out.  Defaults to the
            module-level ``max_pos``.
        scaling: Near-miss tolerance, bigger means higher scores; must be
            non-zero.  Defaults to the module-level ``near_miss_scaling``.

    Returns:
        The match percentage as a float, or ``0.0`` when no points are
        available (for example when ``list1`` is empty).

    Raises:
        ValueError: If an item of ``list1`` does not occur in ``list2``.
    """
    # Fall back to the module-level tuning constants when not overridden.
    if weighting is None:
        weighting = position_weighting
    if span is None:
        span = max_pos
    if scaling is None:
        scaling = near_miss_scaling

    total_points = 0
    total_match_points = 0
    for position, item in enumerate(list1):
        # Earlier positions are worth more points.
        weight = (weighting + 1) - math.ceil(position * weighting / span)
        total_points += weight

        # Scaled gap matching: the further the item has moved in list2,
        # the smaller the share of the weight it earns.
        position2 = list2.index(item)
        scale_factor = 1.0 / ((math.fabs(position - position2) / scaling) + 1.0)
        total_match_points += scale_factor * weight

    # Nothing to score: avoid dividing by zero.
    if not total_points:
        return 0.0

    # Round half away from zero explicitly so ties give the same score on
    # Python 2 and Python 3 (the built-in round() differs between them).
    rounded_points = math.copysign(
        math.floor(math.fabs(total_match_points) + 0.5), total_match_points
    )
    return rounded_points * 100 / total_points
def genlist(size):
    """Return the integers ``1..size`` in random order.

    Uses the module-level ``random`` generator without reseeding it, so a
    caller that seeds ``random`` beforehand gets reproducible lists.

    Args:
        size: Number of items in the list; ``0`` yields an empty list.

    Returns:
        A new, shuffled list containing each integer from 1 to ``size`` once.
    """
    items = list(range(1, size + 1))
    random.shuffle(items)
    return items
def main(trials=10000):
    """Score many random pairs of rankings and print summary statistics.

    Each trial shuffles two 10-item lists with ``genlist`` and scores them
    with ``get_match_pct``; the zero count, mean, minimum and maximum of the
    scores are printed afterwards.

    Args:
        trials: Number of random list pairs to score.
    """
    list_size = 10
    results = []
    for _ in range(trials):
        first = genlist(list_size)
        second = genlist(list_size)
        results.append(get_match_pct(first, second))

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("0 count: " + str(results.count(0)))
    print("mean: " + str(sum(results) / len(results)))
    print("min: " + str(min(results)))
    print("max: " + str(max(results)))


# Run the simulation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment