Skip to content

Instantly share code, notes, and snippets.

@tsaylor
Created April 23, 2011 02:08
Show Gist options
  • Save tsaylor/938164 to your computer and use it in GitHub Desktop.
Save tsaylor/938164 to your computer and use it in GitHub Desktop.
This is a tool for measuring the similarity between two ranked lists. The more items that sit at the same position, or at nearby positions, in both lists, the higher the score. I believe I used this to determine the best match for rankings of some items, but I no longer remember the details.
import random, math
# Tuning knobs read by get_match_pct().
# Position 0 weighs position_weighting + 1; later positions weigh less.
position_weighting = 3.0
# Number of positions over which the weight steps down to 1.
max_pos = 10
near_miss_scaling = 1.0 # bigger means higher scores
# NOTE(review): these two module-level totals are never read in this file;
# get_match_pct() keeps its own local totals. Kept in case other code uses them.
total_points = 0
total_match_points = 0
# Distinct scale factors seen so far; filled in (and printed) by get_match_pct().
sf = []


def get_match_pct(list1, list2):
    """Return how closely the ordering of list2 matches list1, as a percentage.

    Every item of list1 is worth a position-dependent weight (earlier
    positions are worth more). The item earns its full weight when it sits
    at the same index in list2, and a fraction ``1 / (gap / near_miss_scaling
    + 1)`` of it when it is ``gap`` positions away. The earned points are
    rounded to a whole number and divided by the total available points.

    Args:
        list1: Reference ranking; every item must also occur in list2.
        list2: Ranking to compare. Items found only in list2 are ignored,
            and the first occurrence of a duplicated item is used.

    Returns:
        The match percentage as a float. Identical lists score 100.0 with
        the default settings. An empty list1 (no points available) scores
        0.0 instead of raising ZeroDivisionError.

    Raises:
        ValueError: If an item of list1 does not occur in list2.

    Side effects:
        Each scale factor not seen before is printed and appended to the
        module-level list ``sf`` (a debugging aid kept from the original).
    """
    total_points = 0
    total_match_points = 0
    for position, answer in enumerate(list1):
        # Weight drops in steps from position_weighting + 1 down to 1 at
        # position max_pos.
        weight = (position_weighting + 1) - math.ceil(
            position * position_weighting / max_pos)
        # Far beyond max_pos the formula goes negative, which would subtract
        # points for late positions; such positions simply carry no weight.
        weight = max(weight, 0)
        total_points += weight
        # Scaled gap matching (requires all answers to be in both lists!).
        position2 = list2.index(answer)
        # math.fabs keeps the division in floating point even when
        # near_miss_scaling is an integer under Python 2.
        scale_factor = 1.0 / (
            (math.fabs(position - position2) / near_miss_scaling) + 1.0)
        if scale_factor not in sf:
            print(scale_factor)
            sf.append(scale_factor)
        total_match_points += float(scale_factor) * weight
    if total_points <= 0:
        # Nothing could be scored (empty list1), so there is no ratio to take.
        return 0.0
    # The earned points are rounded before the percentage is taken, so the
    # result moves in steps of 100 / total_points.
    total_match_points = round(total_match_points)
    return total_match_points * 100 / total_points
def genlist(size):
    """Return the integers 1..size as a list in random order.

    Args:
        size: Number of items to generate. The original ignored this and
            always produced ten items; genlist(10) is unchanged.

    Returns:
        A new shuffled list containing each of 1..size exactly once
        (an empty list when size is 0 or negative).
    """
    a = list(range(1, size + 1))
    # No random.seed() here: reseeding on every call would throw away a seed
    # the caller set for a reproducible run. The random module seeds itself
    # when it is first imported.
    random.shuffle(a)
    return a
# Simulation: score 10000 pairs of independently shuffled ten-item rankings
# and summarise the resulting match percentages.
count = 0  # never incremented; retained as a module-level name
rresult = [get_match_pct(genlist(10), genlist(10)) for _ in range(10000)]

# Report, in order: how many pairs scored exactly 0, then the mean, the
# lowest and the highest score.
for label, value in (
    ("0 count: ", rresult.count(0)),
    ("mean: ", sum(rresult) / (len(rresult))),
    ("min: ", min(rresult)),
    ("max: ", max(rresult)),
):
    print(label + str(value))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment