Skip to content

Instantly share code, notes, and snippets.

@michellesun
Created October 11, 2012 12:42
Show Gist options
  • Save michellesun/3872046 to your computer and use it in GitHub Desktop.
Save michellesun/3872046 to your computer and use it in GitHub Desktop.
colintch8.py
# Yahoo application id; getlocation() sends it as the `appid` query parameter.
# NOTE(review): this credential is hard-coded in source - consider loading it
# from configuration or the environment instead of committing it.
yahookey = 'cxtFNDLV34GrCw8Ns25KZt30SxLxZ85dZLUVlPCl.Gi0l.s1wrTTGuGclQK6bP9u6yeN'
from xml.dom.minidom import parseString
from urllib import urlopen, quote_plus
class matchrow:
    """One dataset record: feature values plus a 0/1 match label.

    The final element of ``row`` is the label; everything before it is the
    feature data.
    """

    def __init__(self, row, allnum=False):
        """Split ``row`` into ``self.data`` and ``self.match``.

        row: sequence whose last item is the match flag.
        allnum: when true, every feature is converted to ``float``;
            otherwise the features are kept exactly as given.
        """
        features = row[:-1]
        self.data = [float(value) for value in features] if allnum else features
        # The trailing column says whether the pair matched.
        self.match = int(row[-1])
def loadmatch(f, allnum=False):
    """Read a comma-separated file into a list of ``matchrow`` objects.

    f: path of the file to read; each non-blank line is one record whose
        last comma-separated field is the 0/1 match flag.
    allnum: forwarded to ``matchrow``; when true all feature fields are
        converted to ``float``.

    Returns the list of ``matchrow`` instances in file order.
    """
    rows = []
    # ``with`` closes the handle even if a line fails to parse.
    with open(f) as handle:
        for line in handle:
            if not line.strip():
                # Blank lines carry no record.
                continue
            rows.append(matchrow(line.split(','), allnum))
    return rows
def lineartrain(rows):
    """Compute the per-class average feature vector.

    rows: iterable of records exposing ``match`` (class label) and ``data``
        (sequence of values convertible to ``float``).

    Returns a dict mapping each class label to the list of per-feature
    averages over the rows of that class.
    """
    averages = {}
    counts = {}
    for row in rows:
        cl = row.match
        # First sighting of a class starts its running sums at zero;
        # setdefault returns the stored list either way.
        sums = averages.setdefault(cl, [0.0] * len(row.data))
        for i, value in enumerate(row.data):
            sums[i] += float(value)
        counts[cl] = counts.get(cl, 0) + 1
    # Turn the running sums into averages in place.
    for cl, sums in averages.items():
        count = counts[cl]
        for i in range(len(sums)):
            sums[i] /= count
    return averages
def dotproduct(v1, v2):
    """Return the sum of ``v1[i] * v2[i]`` over every index of ``v1``."""
    total = 0
    for index, left in enumerate(v1):
        total += left * v2[index]
    return total
def dpclassify(point, avgs):
    """Classify ``point`` as 0 or 1 from the two class averages.

    avgs: mapping holding the average vector of class 0 under key 0 and of
        class 1 under key 1.

    Returns 0 when the decision value is strictly positive, otherwise 1.
    """
    center1 = avgs[1]
    center0 = avgs[0]
    # Offset term built from the two averages' self dot products.
    bias = (dotproduct(center1, center1) - dotproduct(center0, center0)) / 2
    score = dotproduct(point, center0) - dotproduct(point, center1) + bias
    return 0 if score > 0 else 1
def yesno(v):
    """Encode an answer as a number: 'yes' -> 1, 'no' -> -1, else 0."""
    if v == 'yes':
        return 1
    if v == 'no':
        return -1
    # Ambiguous or missing answers fall through to the neutral value.
    return 0
def matchcount(interest1, interest2):
    """Count the ':'-separated entries of ``interest1`` found in ``interest2``.

    Every entry of ``interest1`` is tested individually, so a repeated entry
    is counted each time it occurs.
    """
    wanted = interest1.split(':')
    candidates = interest2.split(':')
    return sum(1 for entry in wanted if entry in candidates)
def milesdistance(a1, a2):
    """Placeholder distance: ignores both addresses and always yields 0.

    A second ``milesdistance`` defined further down this file rebinds the
    name once the module has finished loading.
    """
    return 0
# Maps an address string to its (latitude, longitude) tuple so each address
# is only looked up once per process.
loc_cache = {}


def getlocation(address):
    """Return ``(latitude, longitude)`` floats for ``address``.

    Results are memoised in ``loc_cache``; on a miss the Yahoo geocoding
    service is queried over HTTP and the XML reply is parsed.

    address: free-form address string; it is URL-encoded with
        ``quote_plus`` (spaces become plus signs).

    Raises IndexError if the reply has no Latitude/Longitude element.
    NOTE(review): this Yahoo endpoint is believed to be retired - confirm
    before relying on live lookups.
    """
    if address in loc_cache:
        return loc_cache[address]
    # Path segment corrected to 'MapsService'; the original 'MapService' is
    # believed to be a typo - NOTE(review): confirm against the API docs.
    url = ('http://api.local.yahoo.com/MapsService/V1/'
           'geocode?appid=%s&location=%s' % (yahookey, quote_plus(address)))
    response = urlopen(url)
    try:
        data = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()
    doc = parseString(data)
    lat = doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue
    # Tag name corrected from the misspelled 'Longtitude', which never matched.
    lng = doc.getElementsByTagName('Longitude')[0].firstChild.nodeValue
    loc_cache[address] = (float(lat), float(lng))
    return loc_cache[address]
# create an estimation because strict conversion
# from lat/lng to miles can be tricky
def milesdistance(a1, a2):
    """Estimate the distance in miles between two addresses.

    Both addresses are resolved with ``getlocation`` and the coordinate
    differences are scaled by fixed per-degree factors, so the result is an
    approximation rather than an exact conversion.
    """
    miles_per_lat_degree = 69.1  # approximate miles in one degree of latitude
    miles_per_lng_degree = 53.0  # approximate miles in one degree of longitude
    start = getlocation(a1)
    end = getlocation(a2)
    north_south = miles_per_lat_degree * (end[0] - start[0])
    east_west = miles_per_lng_degree * (end[1] - start[1])
    # Straight-line length from the two scaled differences.
    return (north_south ** 2 + east_west ** 2) ** .5
# create the new dataset to train the classifier
## Great way to transform data by applying functions
## on each
def loadnumerical():
    """Load 'matchmaker.csv' and convert every record to numeric features.

    Each raw record is turned into a ``matchrow`` whose features are, in
    order: two float/yes-no/yes-no triples (fields 0-2 and 5-7), the
    ``matchcount`` of fields 3 and 8, and the ``milesdistance`` between
    fields 4 and 9. The original match label is carried over.
    """
    def to_numeric(record):
        # Apply the matching converter to each raw field of one record.
        raw = record.data
        return matchrow([
            float(raw[0]), yesno(raw[1]), yesno(raw[2]),
            float(raw[5]), yesno(raw[6]), yesno(raw[7]),
            matchcount(raw[3], raw[8]),
            milesdistance(raw[4], raw[9]),
            record.match,
        ])

    return [to_numeric(record) for record in loadmatch('matchmaker.csv')]
# scaling the data (age differences vs. opinions on
# children)
# scale by determining the highest/lowest value of each variable
def scaledata(rows):
    """Rescale every feature column to the range 0..1.

    rows: sequence of records exposing ``data`` (list of numeric features)
        and ``match`` (class label). May be empty.

    Returns ``(newrows, scaleinput)``: ``newrows`` holds one ``matchrow`` per
    input row with scaled features and the original label, and
    ``scaleinput`` applies the same scaling to any other feature list.
    """
    # Transpose to columns so each feature's extremes come from min()/max().
    columns = list(zip(*[row.data for row in rows]))
    low = [min(column) for column in columns]
    high = [max(column) for column in columns]

    def scaleinput(d):
        """Scale the feature list ``d`` using the extremes found above."""
        scaled = []
        for i, (lo, hi) in enumerate(zip(low, high)):
            span = hi - lo
            # A constant column has no spread; map it to 0.0 instead of
            # dividing by zero. float() avoids integer division on Python 2.
            scaled.append(float(d[i] - lo) / span if span else 0.0)
        return scaled

    newrows = [matchrow(scaleinput(row.data) + [row.match]) for row in rows]
    return newrows, scaleinput
def rbf(v1, v2, gamma=20):
    """Radial-basis-function kernel of two equal-length vectors.

    Returns ``e ** (-gamma * d)`` where ``d`` is the sum of squared
    coordinate differences, so identical vectors give 1.0 and the value
    decays toward 0 as the vectors move apart.

    v1, v2: numeric sequences; every index of ``v1`` is read from both.
    gamma: decay rate of the kernel.
    """
    # Local import: no file-level ``import math`` is visible in this module.
    import math

    # NOTE(review): the original called a veclength() helper that is not
    # defined in this file; the squared Euclidean distance used here is the
    # usual RBF form - confirm it matches the intended helper.
    squared_distance = sum([(v1[i] - v2[i]) ** 2 for i in range(len(v1))])
    return math.e ** (-gamma * squared_distance)
def nlclassify(point, rows, offset, gamma=10):
    """Classify ``point`` as 0 or 1 by comparing average kernel similarity.

    point: feature vector to classify.
    rows: records exposing ``data`` and ``match``; rows whose ``match`` is 0
        form class 0, all others form class 1.
    offset: constant added to the decision value (see ``getoffset``).
    gamma: passed through to ``rbf``.

    Returns 0 when the decision value is strictly positive, otherwise 1.
    Raises ZeroDivisionError when ``rows`` has no record of one class.
    """
    sums = [0.0, 0.0]
    counts = [0, 0]
    for row in rows:
        cls = 0 if row.match == 0 else 1
        sums[cls] += rbf(point, row.data, gamma)
        counts[cls] += 1
    y = (1.0 / counts[0]) * sums[0] - (1.0 / counts[1]) * sums[1] + offset
    # y is mean similarity to class 0 minus mean similarity to class 1, so a
    # positive value means class 0 - the same sign convention dpclassify
    # uses. The original tested ``y < 0`` here, which inverted the answer.
    return 0 if y > 0 else 1
def getoffset(rows, gamma=10):
    """Compute the offset term passed to ``nlclassify``.

    rows: records exposing ``data`` and ``match``; rows whose ``match`` is 0
        form class 0, all others form class 1.
    gamma: passed through to ``rbf``.

    Returns the mean pairwise ``rbf`` value within class 1 minus the mean
    pairwise ``rbf`` value within class 0 (each mean taken over all ordered
    pairs, including a vector with itself).
    """
    class0 = [row.data for row in rows if row.match == 0]
    class1 = [row.data for row in rows if not row.match == 0]

    def kernel_total(vectors):
        # Sum of rbf over every ordered pair drawn from ``vectors``.
        return sum(sum([rbf(a, b, gamma) for a in vectors]) for b in vectors)

    sum0 = kernel_total(class0)
    sum1 = kernel_total(class1)
    # NOTE(review): dpclassify halves the analogous difference of self dot
    # products; confirm whether this offset should also be divided by 2.
    return (1.0 / (len(class1) ** 2)) * sum1 - (1.0 / (len(class0) ** 2)) * sum0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment