# michellesun/colint ch8 svm

## colint ch8 svm
# Yahoo application ID; getlocation sends it as the ``appid`` query parameter.
# NOTE(review): this looks like a real credential committed in source --
# consider loading it from configuration instead.
yahookey = 'cxtFNDLV34GrCw8Ns25KZt30SxLxZ85dZLUVlPCl.Gi0l.s1wrTTGuGclQK6bP9u6yeN'
from xml.dom.minidom import parseString
from urllib import urlopen, quote_plus

class matchrow:
	"""One record: feature values in ``data`` plus an integer label in ``match``.

	The final element of ``row`` is the label; everything before it is the
	feature data. With ``allnum`` true every feature is converted to float,
	otherwise the features are kept exactly as given.
	"""

	def __init__(self,row,allnum=False):
		features = row[:-1]
		# Numeric mode converts each feature; raw mode keeps the slice as-is.
		self.data = [float(value) for value in features] if allnum else features
		# The trailing element is the 0/1 match flag.
		self.match = int(row[-1])

def loadmatch(f,allnum=False):
	"""Read a comma-separated file into a list of matchrow objects.

	f      -- path of the file to read; each non-blank line is one record
	allnum -- forwarded to matchrow; when true the features become floats

	Returns the list of matchrow objects in file order. Blank lines are
	skipped so a trailing newline does not break the label conversion.
	"""
	rows = []
	# open() exists on every Python version; the with-block closes the file.
	with open(f) as source:
		for line in source:
			if not line.strip():
				continue
			rows.append(matchrow(line.split(','),allnum))
	return rows

def lineartrain(rows):
	"""Compute the mean feature vector of every class found in ``rows``.

	rows -- iterable of objects exposing ``match`` (the class label) and
	        ``data`` (a sequence of values convertible to float)

	Returns a dict mapping each class label to a list of per-feature means.
	An empty input yields an empty dict.
	"""
	sums = {}
	counts = {}

	for row in rows:
		cl = row.match
		# setdefault stores a zero vector the first time a class is seen and
		# returns the stored list either way, so it can be updated in place.
		totals = sums.setdefault(cl,[0.0]*len(row.data))
		for i, value in enumerate(row.data):
			totals[i] += float(value)
		counts[cl] = counts.get(cl,0) + 1

	# Divide the per-class sums by the per-class counts to get the averages.
	averages = {}
	for cl, totals in sums.items():
		averages[cl] = [total/counts[cl] for total in totals]
	return averages

def dotproduct(v1,v2):
	"""Sum of the element-wise products, walking the indices of ``v1``."""
	total = 0
	for i in range(len(v1)):
		total += v1[i]*v2[i]
	return total

def dpclassify(point,avgs):
	"""Classify ``point`` as 0 or 1 using the class averages in ``avgs``.

	``avgs`` is indexed with 0 and 1 to obtain the two average vectors.
	The score is point.avg0 - point.avg1 + (avg1.avg1 - avg0.avg0)/2;
	a positive score yields 0, anything else yields 1.
	"""
	class1, class0 = avgs[1], avgs[0]
	b = (dotproduct(class1,class1) - dotproduct(class0,class0))/2
	y = dotproduct(point,class0) - dotproduct(point,class1) + b
	return 0 if y > 0 else 1

def yesno(v):
	"""Map 'yes' to 1, 'no' to -1 and every other value to 0."""
	if v == 'yes':
		return 1
	if v == 'no':
		return -1
	# Anything else (ambiguous or missing answers) counts as neutral.
	return 0

def matchcount(interest1, interest2):
	"""Count the ':'-separated items of ``interest1`` that occur in ``interest2``.

	Each item of ``interest1`` is tested on its own, so a repeated item is
	counted once per occurrence.
	"""
	mine = interest1.split(':')
	theirs = interest2.split(':')
	return sum(1 for item in mine if item in theirs)

def milesdistance(a1,a2):
	"""Placeholder distance: ignores both addresses and always reports 0.

	A later definition of the same name in this file replaces it.
	"""
	return 0


# Maps an address string to its (latitude, longitude) tuple so each address
# is only requested once.
loc_cache = {}
def getlocation(address):
	"""Return ``(latitude, longitude)`` as floats for ``address``.

	Results are memoised in ``loc_cache``. On a cache miss the Yahoo
	geocoding URL is fetched with ``yahookey`` as the appid and the first
	Latitude / Longitude elements of the XML reply are parsed.

	Raises IndexError when the reply contains no such elements; errors from
	the HTTP request and XML parsing propagate unchanged.
	"""
	if address in loc_cache:
		return loc_cache[address]
	# quote_plus replaces spaces by plus signs
	url = ('http://api.local.yahoo.com/MapService/V1/'
			'geocode?appid=%s&location=%s' % (yahookey,quote_plus(address)))
	response = urlopen(url)
	try:
		data = response.read()
	finally:
		response.close()
	doc = parseString(data)
	lat = doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue
	# The tag is spelled 'Longitude'; the misspelt lookup matched nothing.
	lng = doc.getElementsByTagName('Longitude')[0].firstChild.nodeValue
	loc_cache[address] = (float(lat),float(lng))
	return loc_cache[address]

# create an estimation because strict conversion
# from lat/lng to miles can be tricky
def milesdistance(a1,a2):
	"""Approximate distance in miles between two addresses.

	Both addresses are resolved with getlocation. The latitude difference
	is scaled by 69.1 and the longitude difference by 53.0 (rough miles per
	degree), then combined as a straight-line length.
	"""
	(lat1, lng1), (lat2, lng2) = getlocation(a1), getlocation(a2)
	north_south = 69.1 * (lat2-lat1)
	east_west = 53.0 * (lng2-lng1)
	return (north_south**2 + east_west**2)**.5


# create the new dataset to train the classifier
## Great way to transform data by applying functions
## on each
def loadnumerical():
	"""Load 'matchmaker.csv' and convert each record to numeric features.

	Per record: columns 0 and 5 become floats, columns 1, 2, 6 and 7 go
	through yesno, columns 3 and 8 are compared with matchcount, and
	columns 4 and 9 are passed to milesdistance. The original match label
	is appended last. Returns a list of matchrow objects.
	"""
	newrows = []
	for row in loadmatch('matchmaker.csv'):
		fields = row.data
		features = [
			float(fields[0]),
			yesno(fields[1]),
			yesno(fields[2]),
			float(fields[5]),
			yesno(fields[6]),
			yesno(fields[7]),
			matchcount(fields[3],fields[8]),
			milesdistance(fields[4],fields[9]),
		]
		# matchrow expects the label as the final element.
		features.append(row.match)
		newrows.append(matchrow(features))
	return newrows

# scaling the data (age differences vs. opinions on
# children)
# scale by determining the highest/lowest value of each variable
def scaledata(rows):
	"""Rescale every feature column of ``rows`` to the range 0..1.

	rows -- list of objects exposing ``data`` (numeric sequence) and ``match``

	Returns ``(newrows, scaleinput)``: ``newrows`` holds matchrow objects
	with the scaled features, and ``scaleinput(d)`` applies the same
	scaling to a plain feature list ``d``. A column whose values are all
	equal scales to 0.0 instead of dividing by zero. An empty ``rows``
	gives ``([], scaleinput)``.
	"""
	width = len(rows[0].data) if rows else 0
	low = [999999999.0]*width
	high = [-999999999.0]*width
	# Track the smallest and largest value seen in each column.
	for row in rows:
		for i, value in enumerate(row.data):
			if value < low[i]:
				low[i] = value
			if value > high[i]:
				high[i] = value

	def scaleinput(d):
		"""Scale the feature list ``d`` with the column bounds found above."""
		scaled = []
		for i in range(len(low)):
			span = high[i]-low[i]
			# float() keeps integer columns from truncating under Python 2.
			scaled.append(float(d[i]-low[i])/span if span else 0.0)
		return scaled

	# Scale all the data, re-attaching each row's label as the last item.
	newrows = [matchrow(scaleinput(row.data)+[row.match])
			for row in rows]
	return newrows, scaleinput

def rbf(v1,v2,gamma=20):
	"""Radial-basis-function kernel: exp(-gamma * squared distance).

	v1, v2 -- numeric sequences; the indices of ``v1`` are the ones used
	gamma  -- width parameter of the kernel

	Returns 1.0 for identical vectors and decays towards 0 as they move
	apart.

	NOTE(review): the original called ``veclength``, which is not defined
	in the visible file; the squared Euclidean distance (the usual RBF
	form) is computed inline here -- confirm.
	"""
	# Imported locally because the file does not import math at the top.
	import math
	sqdist = sum([(v1[i]-v2[i])**2 for i in range(len(v1))])
	return math.exp(-gamma*sqdist)

def nlclassify(point,rows,offset,gamma=10):
	"""Classify ``point`` as 0 or 1 by its average kernel similarity.

	point  -- feature sequence to classify
	rows   -- training objects exposing ``data`` and ``match``
	offset -- constant added to the score (see getoffset)
	gamma  -- forwarded to rbf

	The score is mean rbf to the class-0 rows minus mean rbf to the other
	rows, plus ``offset``. A positive score means the point is more similar
	to class 0, so 0 is returned; otherwise 1. This is the same sign
	convention dpclassify uses. A class with no rows contributes a mean of
	0.0 instead of raising ZeroDivisionError.
	"""
	sums = [0.0, 0.0]
	counts = [0, 0]

	for row in rows:
		idx = 0 if row.match == 0 else 1
		sums[idx] += rbf(point,row.data,gamma)
		counts[idx] += 1

	mean0 = sums[0]/counts[0] if counts[0] else 0.0
	mean1 = sums[1]/counts[1] if counts[1] else 0.0
	y = mean0 - mean1 + offset
	if y > 0:
		return 0
	return 1

def getoffset(rows,gamma=10):
	"""Offset term for nlclassify, computed from the training ``rows``.

	Rows are split by ``match == 0`` versus everything else. The result is
	the mean pairwise rbf value inside the non-zero class minus the mean
	pairwise rbf value inside the zero class.

	NOTE(review): dpclassify halves its analogous bias term while this one
	is not halved -- confirm that is intended.
	"""
	groups = ([], [])
	for row in rows:
		groups[0 if row.match == 0 else 1].append(row.data)
	negatives, positives = groups

	# Sum the kernel over every ordered pair within each class.
	sum0 = sum(sum([rbf(a,b,gamma) for a in negatives]) for b in negatives)
	sum1 = sum(sum([rbf(a,b,gamma) for a in positives]) for b in positives)

	within1 = (1.0/(len(positives)**2))*sum1
	within0 = (1.0/(len(negatives)**2))*sum0
	return within1 - within0
	yahookey = 'cxtFNDLV34GrCw8Ns25KZt30SxLxZ85dZLUVlPCl.Gi0l.s1wrTTGuGclQK6bP9u6yeN'
	from xml.dom.minidom import parseString
	from urllib import urlopen, quote_plus

	class matchrow:
		"""One record: feature values in ``data`` plus an integer label in ``match``.

		The final element of ``row`` is the label; everything before it is
		the feature data. With ``allnum`` true every feature is converted to
		float, otherwise the features are kept exactly as given.
		"""

		def __init__(self,row,allnum=False):
			features = row[:-1]
			# Numeric mode converts each feature; raw mode keeps the slice.
			self.data = [float(value) for value in features] if allnum else features
			# The trailing element is the 0/1 match flag.
			self.match = int(row[-1])

	def loadmatch(f,allnum=False):
		"""Read a comma-separated file into a list of matchrow objects.

		f      -- path of the file to read; each non-blank line is one record
		allnum -- forwarded to matchrow; when true the features become floats

		Returns the list of matchrow objects in file order. Blank lines are
		skipped so a trailing newline does not break the label conversion.
		"""
		rows = []
		# open() exists on every Python version; the with-block closes it.
		with open(f) as source:
			for line in source:
				if not line.strip():
					continue
				rows.append(matchrow(line.split(','),allnum))
		return rows

	def lineartrain(rows):
		"""Compute the mean feature vector of every class found in ``rows``.

		rows -- iterable of objects exposing ``match`` (the class label) and
		        ``data`` (a sequence of values convertible to float)

		Returns a dict mapping each class label to a list of per-feature
		means. An empty input yields an empty dict.
		"""
		sums = {}
		counts = {}

		for row in rows:
			cl = row.match
			# setdefault stores a zero vector the first time a class is seen
			# and returns the stored list either way.
			totals = sums.setdefault(cl,[0.0]*len(row.data))
			for i, value in enumerate(row.data):
				totals[i] += float(value)
			counts[cl] = counts.get(cl,0) + 1

		# Divide the per-class sums by the per-class counts.
		averages = {}
		for cl, totals in sums.items():
			averages[cl] = [total/counts[cl] for total in totals]
		return averages

	def dotproduct(v1,v2):
		"""Sum of the element-wise products, walking the indices of ``v1``."""
		total = 0
		for i in range(len(v1)):
			total += v1[i]*v2[i]
		return total

	def dpclassify(point,avgs):
		"""Classify ``point`` as 0 or 1 using the class averages in ``avgs``.

		``avgs`` is indexed with 0 and 1 to obtain the two average vectors.
		The score is point.avg0 - point.avg1 + (avg1.avg1 - avg0.avg0)/2;
		a positive score yields 0, anything else yields 1.
		"""
		b = (dotproduct(avgs[1],avgs[1]) - dotproduct(avgs[0],avgs[0]))/2
		y = dotproduct(point,avgs[0]) - dotproduct(point,avgs[1]) + b
		if y > 0:
			return 0
		return 1

	def yesno(v):
		"""Map 'yes' to 1, 'no' to -1 and every other value to 0."""
		if v == 'yes':
			return 1
		if v == 'no':
			return -1
		# Anything else (ambiguous or missing answers) counts as neutral.
		return 0

	def matchcount(interest1, interest2):
		"""Count the ':'-separated items of ``interest1`` found in ``interest2``.

		Each item of ``interest1`` is tested on its own, so a repeated item
		is counted once per occurrence.
		"""
		mine = interest1.split(':')
		theirs = interest2.split(':')
		return sum(1 for item in mine if item in theirs)

	def milesdistance(a1,a2):
		"""Placeholder distance: ignores both addresses and always reports 0."""
		return 0


	# Maps an address string to its (latitude, longitude) tuple so each
	# address is only requested once.
	loc_cache = {}
	def getlocation(address):
		"""Return ``(latitude, longitude)`` as floats for ``address``.

		Results are memoised in ``loc_cache``. On a cache miss the Yahoo
		geocoding URL is fetched with ``yahookey`` as the appid and the
		first Latitude / Longitude elements of the XML reply are parsed.

		Raises IndexError when the reply contains no such elements.
		"""
		if address in loc_cache:
			return loc_cache[address]
		# quote_plus replaces spaces by plus signs
		url = ('http://api.local.yahoo.com/MapService/V1/'
				'geocode?appid=%s&location=%s' % (yahookey,quote_plus(address)))
		response = urlopen(url)
		try:
			data = response.read()
		finally:
			response.close()
		doc = parseString(data)
		lat = doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue
		# The tag is spelled 'Longitude'; the misspelt lookup matched nothing.
		lng = doc.getElementsByTagName('Longitude')[0].firstChild.nodeValue
		loc_cache[address] = (float(lat),float(lng))
		return loc_cache[address]

	# create an estimation because strict conversion
	# from lat/lng to miles can be tricky
	def milesdistance(a1,a2):
		"""Approximate distance in miles between two addresses.

		Both addresses are resolved with getlocation. The latitude
		difference is scaled by 69.1 and the longitude difference by 53.0
		(rough miles per degree), then combined as a straight-line length.
		"""
		lat1, lng1 = getlocation(a1)
		lat2, lng2 = getlocation(a2)
		latdif = 69.1 * (lat2-lat1)
		lngdif = 53.0 * (lng2-lng1)
		# The squares were lost in the flattened text (latdif2 / lngdif2).
		return (latdif**2 + lngdif**2)**.5


	# create the new dataset to train the classifier
	## Great way to transform data by applying functions
	## on each
	def loadnumerical():
		"""Load 'matchmaker.csv' and convert each record to numeric features.

		Per record: columns 0 and 5 become floats, columns 1, 2, 6 and 7 go
		through yesno, columns 3 and 8 are compared with matchcount, and
		columns 4 and 9 are passed to milesdistance. The original match
		label is appended last. Returns a list of matchrow objects.
		"""
		oldrows = loadmatch('matchmaker.csv')
		newrows = []
		for row in oldrows:
			d = row.data
			data = [float(d[0]),yesno(d[1]),yesno(d[2]),
					float(d[5]),yesno(d[6]),yesno(d[7]),
					matchcount(d[3],d[8]),
					milesdistance(d[4],d[9]),
					row.match]
			newrows.append(matchrow(data))
		return newrows

	# scaling the data (age differences vs. opinions on
	# children)
	# scale by determining the highest/lowest value of each variable
	def scaledata(rows):
		"""Rescale every feature column of ``rows`` to the range 0..1.

		rows -- list of objects exposing ``data`` (numeric sequence) and
		        ``match``

		Returns ``(newrows, scaleinput)``: ``newrows`` holds matchrow
		objects with the scaled features, and ``scaleinput(d)`` applies the
		same scaling to a plain feature list ``d``. A column whose values
		are all equal scales to 0.0 instead of dividing by zero. An empty
		``rows`` gives ``([], scaleinput)``.
		"""
		width = len(rows[0].data) if rows else 0
		low = [999999999.0]*width
		high = [-999999999.0]*width
		# Track the smallest and largest value seen in each column.
		for row in rows:
			for i, value in enumerate(row.data):
				if value < low[i]:
					low[i] = value
				if value > high[i]:
					high[i] = value

		def scaleinput(d):
			"""Scale the feature list ``d`` with the column bounds found above."""
			scaled = []
			for i in range(len(low)):
				span = high[i]-low[i]
				# float() keeps integer columns from truncating on Python 2.
				scaled.append(float(d[i]-low[i])/span if span else 0.0)
			return scaled

		# Scale all the data, re-attaching each row's label as the last item.
		newrows = [matchrow(scaleinput(row.data)+[row.match])
				for row in rows]
		return newrows, scaleinput

	def rbf(v1,v2,gamma=20):
		"""Radial-basis-function kernel: exp(-gamma * squared distance).

		v1, v2 -- numeric sequences; the indices of ``v1`` are the ones used
		gamma  -- width parameter of the kernel

		NOTE(review): the original called ``veclength``, which is not
		defined in the visible file; the squared Euclidean distance (the
		usual RBF form) is computed inline here -- confirm.
		"""
		# Imported locally because the file does not import math at the top.
		import math
		sqdist = sum([(v1[i]-v2[i])**2 for i in range(len(v1))])
		return math.exp(-gamma*sqdist)

	def nlclassify(point,rows,offset,gamma=10):
		"""Classify ``point`` as 0 or 1 by its average kernel similarity.

		point  -- feature sequence to classify
		rows   -- training objects exposing ``data`` and ``match``
		offset -- constant added to the score (see getoffset)
		gamma  -- forwarded to rbf

		The score is mean rbf to the class-0 rows minus mean rbf to the
		other rows, plus ``offset``. A positive score means the point is
		more similar to class 0, so 0 is returned; otherwise 1. A class with
		no rows contributes a mean of 0.0 instead of raising
		ZeroDivisionError.
		"""
		sums = [0.0, 0.0]
		counts = [0, 0]

		for row in rows:
			idx = 0 if row.match == 0 else 1
			sums[idx] += rbf(point,row.data,gamma)
			counts[idx] += 1

		mean0 = sums[0]/counts[0] if counts[0] else 0.0
		mean1 = sums[1]/counts[1] if counts[1] else 0.0
		y = mean0 - mean1 + offset
		if y > 0:
			return 0
		return 1

	def getoffset(rows,gamma=10):
		"""Offset term for nlclassify, computed from the training ``rows``.

		Rows are split by ``match == 0`` versus everything else. The result
		is the mean pairwise rbf value inside the non-zero class minus the
		mean pairwise rbf value inside the zero class.
		"""
		l0 = []
		l1 = []
		for row in rows:
			if row.match == 0:
				l0.append(row.data)
			else:
				l1.append(row.data)
		# Sum the kernel over every ordered pair within each class.
		sum0 = sum(sum([rbf(v1,v2,gamma) for v1 in l0]) for v2 in l0)
		sum1 = sum(sum([rbf(v1,v2,gamma) for v1 in l1]) for v2 in l1)

		# The '**' and '*' operators were lost in the flattened text.
		return (1.0/(len(l1)**2))*sum1-(1.0/(len(l0)**2))*sum0