danielhfrank/nycnames.py

## nycnames.py
#!/usr/bin/env python
# encoding: utf-8
"""
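nycnames.py

Compares ethnic groups by the relative frequency of the names recorded for
each, reading comma-separated rows from standard input.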

Created by Daniel Frank on 2011-10-24.
Copyright (c) 2011 __MyCompanyName__. All rights reserved.
"""

import sys
import os
import numpy as np

class Ethnicity:
	"""Name counts recorded for a single ethnic group."""

	def __init__(self,name):
		# name -> count; starts out empty
		self.name_freqs = {}
		self.name = name

	def get_normalized_name_freqs(self):
		"""Return a new dict mapping each name to its share of the summed counts."""
		count_sum = float(sum(self.name_freqs.values()))
		return {
			key: count / count_sum
			for key, count in self.name_freqs.items()
		}

class Main:
	"""Compare ethnic groups by the relative frequencies of their names.

	Rows are read as comma-separated text from standard input. Each group is
	turned into a vector of normalized name frequencies laid out according to
	a shared name index, and groups are ranked by the Euclidean distance
	between those vectors.
	"""

	def __init__(self):
		# Maps a name to the vector position it occupies, so that every
		# group's vector uses the same layout.
		self.name_indexes = {}
		# Maps a group name to its Ethnicity record; main() rebuilds it.
		self.ethnicities = {}

	def main(self):
		"""Load rows from stdin, then print the nearest groups for four groups.

		Column 2 holds the group name, column 3 the name and column 4 its
		count. Rows with too few columns or a non-integer count (header or
		blank lines) are skipped.

		Raises:
			KeyError: if one of the four reported groups never appeared.
		"""
		self.ethnicities = {}
		for line in sys.stdin:
			columns = line.strip('\n').split(',')

			# Header lines have a non-numeric count, and blank or truncated
			# lines have too few columns; neither carries data.
			try:
				eth_name = columns[2]
				name_value = columns[3]
				name_count = int(columns[4])
			except (IndexError, ValueError):
				continue

			if eth_name not in self.ethnicities:
				self.ethnicities[eth_name] = Ethnicity(eth_name)

			# NOTE(review): a repeated (group, name) row replaces the earlier
			# count instead of adding to it -- confirm this is intended.
			self.ethnicities[eth_name].name_freqs[name_value] = name_count
			self.add_name_to_index(name_value)

		# Resolve every group before printing so a missing one fails early.
		reported = [self.ethnicities[key] for key in (
			'BLACK NON HISPANIC',
			'WHITE NON HISPANIC',
			'HISPANIC',
			'ASIAN AND PACIFIC ISLANDER',
		)]
		print('####')
		for ethnicity in reported:
			self.print_closest(ethnicity)

	def add_name_to_index(self, name_value):
		"""Give name_value the next free vector position if it has none yet."""
		if name_value not in self.name_indexes:
			# Positions are handed out consecutively from 0 and never removed,
			# so the next free one always equals the current size.
			self.name_indexes[name_value] = len(self.name_indexes)

	def compare(self,borough1,borough2):
		"""Return the Euclidean distance between the two groups' vectors."""
		return np.linalg.norm(self.vectorize(borough1) - self.vectorize(borough2))

	def vectorize(self,borough):
		"""Return the group's normalized name frequencies as a numpy vector.

		The vector has one slot per indexed name; names the group does not
		use stay at zero.

		Raises:
			KeyError: if the group has a name missing from the name index.
		"""
		vector = np.zeros(len(self.name_indexes))
		for name, freq in borough.get_normalized_name_freqs().items():
			vector[self.name_indexes[name]] = freq
		return vector

	def _closest(self, ethnicity):
		"""Return (name, distance) pairs for every other group, nearest first."""
		distances = [
			(other.name, self.compare(ethnicity, other))
			for other in self.ethnicities.values()
			if other is not ethnicity  # never compare a group with itself
		]
		distances.sort(key=lambda pair: pair[1])
		return distances

	def print_closest(self, ethnicity):
		"""Print every other group and its distance from ethnicity, nearest first."""
		# Rank first so that a failure leaves no partial report behind.
		ranked = self._closest(ethnicity)
		print('*******%s************' % ethnicity.name)
		for name, distance in ranked:
			print('\t%s\t%f' % (name, distance))


if __name__ == '__main__':
	# Script entry point: read the rows from stdin and print the report.
	Main().main()