Created
October 26, 2011 23:47
-
-
Save danielhfrank/1318361 to your computer and use it in GitHub Desktop.
Code for GA Data Science class
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
untitled.py | |
Created by Daniel Frank on 2011-10-24. | |
Copyright (c) 2011 __MyCompanyName__. All rights reserved. | |
""" | |
import sys | |
import os | |
import numpy as np | |
class Ethnicity:
    """Name counts observed for a single ethnic group."""

    def __init__(self, name):
        # Label identifying this group.
        self.name = name
        # Maps a first name to its raw count; starts empty and is filled
        # in from outside the class.
        self.name_freqs = {}

    def get_normalized_name_freqs(self):
        """Return a new dict mapping each name to its share of the total count.

        The stored raw counts are left untouched. An empty ``name_freqs``
        yields an empty dict.
        """
        counts = self.name_freqs
        # Force a float total so the division below is never integer division.
        total = float(sum(counts.values()))
        return dict((name, count / total) for name, count in counts.items())
class Main:
    """Compare ethnic groups by the relative frequencies of their names.

    ``main`` reads comma-separated rows from standard input, builds one
    ``Ethnicity`` per label found in the third column, and prints, for each
    group listed in ``REPORTED_ETHNICITIES``, every other group ordered by
    the Euclidean distance between their normalized name-frequency vectors.
    """

    # Groups whose nearest neighbours main() reports, in output order.
    REPORTED_ETHNICITIES = (
        'BLACK NON HISPANIC',
        'WHITE NON HISPANIC',
        'HISPANIC',
        'ASIAN AND PACIFIC ISLANDER',
    )

    def __init__(self):
        # Maps a name to the array index it occupies, for vector comparison.
        self.name_indexes = {}
        # Maps an ethnicity label to its Ethnicity object; reset by main().
        # Initialised here so the comparison methods work before main() runs.
        self.ethnicities = {}

    def main(self):
        """Load name counts from stdin and print each reported group's ranking.

        Each row is split on commas: column 2 is the ethnicity label,
        column 3 the name and column 4 its count. Rows with fewer than five
        columns, or whose count is not an integer (such as a title line),
        are skipped. Reported groups that never appear in the input are
        announced on stderr and skipped.
        """
        self.ethnicities = {}
        for line in sys.stdin:
            columns = line.rstrip('\r\n').split(',')
            if len(columns) < 5:
                # Blank or truncated row: nothing usable to index into.
                continue
            # Convert the count first so title lines are rejected early.
            try:
                name_count = int(columns[4])
            except ValueError:
                continue
            eth_name = columns[2]
            name_value = columns[3]  # the actual name in this row
            if eth_name not in self.ethnicities:
                self.ethnicities[eth_name] = Ethnicity(eth_name)
            # NOTE(review): a later row for the same (ethnicity, name) pair
            # replaces the earlier count instead of adding to it -- confirm
            # that this is intended for inputs with repeated names.
            self.ethnicities[eth_name].name_freqs[name_value] = name_count
            self.add_name_to_index(name_value)

        print('####')
        for eth_name in self.REPORTED_ETHNICITIES:
            ethnicity = self.ethnicities.get(eth_name)
            if ethnicity is None:
                sys.stderr.write('no rows found for %s\n' % eth_name)
                continue
            self.print_closest(ethnicity)

    def add_name_to_index(self, name_value):
        """Give ``name_value`` the next free vector index unless it has one."""
        if name_value not in self.name_indexes:
            # Indexes are handed out consecutively from 0 by this method, so
            # the next free index equals the current size of the mapping.
            self.name_indexes[name_value] = len(self.name_indexes)

    def compare(self, borough1, borough2):
        """Return the Euclidean distance between two groups' name vectors.

        Despite the parameter names, both arguments are passed to
        ``vectorize`` and must therefore provide
        ``get_normalized_name_freqs()``.
        """
        vector1 = self.vectorize(borough1)
        vector2 = self.vectorize(borough2)
        return np.linalg.norm(vector1 - vector2)

    def vectorize(self, borough):
        """Return the group's normalized name frequencies as a dense vector.

        The vector has one slot per entry in ``name_indexes``; names the
        group does not have stay at zero. Raises ``KeyError`` if the group
        holds a name that was never passed to ``add_name_to_index``.
        """
        vector = np.zeros(len(self.name_indexes))
        for name, freq in borough.get_normalized_name_freqs().items():
            vector[self.name_indexes[name]] = freq
        return vector

    def ranked_distances(self, ethnicity):
        """Return ``(name, distance)`` for every other group, nearest first.

        Groups at equal distance keep the iteration order of
        ``self.ethnicities``.
        """
        distances = [
            (other.name, self.compare(ethnicity, other))
            for other in self.ethnicities.values()
            if other is not ethnicity  # never compare a group with itself
        ]
        distances.sort(key=lambda pair: pair[1])
        return distances

    def print_closest(self, ethnicity):
        """Print a header for ``ethnicity`` and its ranked neighbours."""
        print('*******%s************' % ethnicity.name)
        for other_name, distance in self.ranked_distances(ethnicity):
            print('\t%s\t%f' % (other_name, distance))
# Run the comparison only when executed as a script, not when imported.
if __name__ == '__main__':
    app = Main()
    app.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment