Skip to content

Instantly share code, notes, and snippets.

@danielhfrank
Created October 26, 2011 23:47
Show Gist options
  • Save danielhfrank/1318361 to your computer and use it in GitHub Desktop.
Save danielhfrank/1318361 to your computer and use it in GitHub Desktop.
Code for GA Data Science class
#!/usr/bin/env python
# encoding: utf-8
"""
untitled.py
Created by Daniel Frank on 2011-10-24.
Copyright (c) 2011 __MyCompanyName__. All rights reserved.
"""
import sys
import os
import numpy as np
class Ethnicity:
    """Name counts recorded for a single ethnic group."""

    def __init__(self, name):
        # Label identifying this group.
        self.name = name
        # Maps a first name to its count within this group; filled in by
        # whoever parses the input.
        self.name_freqs = {}

    def get_normalized_name_freqs(self):
        """Return a dict mapping each name to its share of the total count.

        Every stored count is divided by the sum of all stored counts.  An
        empty ``name_freqs`` yields an empty dict.  When the counts sum to
        zero, every name maps to 0.0 instead of raising ZeroDivisionError.
        """
        # float() keeps the division below a true division on Python 2 too.
        total = float(sum(self.name_freqs.values()))
        if not total:
            return dict.fromkeys(self.name_freqs, 0.0)
        return {name: freq / total for name, freq in self.name_freqs.items()}
class Main:
    """Rank ethnic groups by how similar their first-name distributions are.

    Each group's name counts are normalized into a frequency vector with one
    slot per distinct name seen in the input; two groups are compared by the
    Euclidean distance between their vectors.
    """

    # Groups whose nearest neighbours main() reports, in print order.
    REPORTED_ETHNICITIES = (
        'BLACK NON HISPANIC',
        'WHITE NON HISPANIC',
        'HISPANIC',
        'ASIAN AND PACIFIC ISLANDER',
    )

    def __init__(self):
        # Maps a name to the array index it should occupy, for vector
        # comparison.
        self.name_indexes = {}
        # Maps an ethnicity label to its Ethnicity record.  Created here so
        # the other methods work on a fresh instance; main() resets it.
        self.ethnicities = {}

    def main(self):
        """Read CSV rows from stdin and print each reported group's neighbours.

        Groups listed in REPORTED_ETHNICITIES that never appear in the input
        are skipped with a note on stderr instead of raising KeyError.
        """
        self.ethnicities = {}
        self._load(sys.stdin)

        print('####')
        for eth_name in self.REPORTED_ETHNICITIES:
            ethnicity = self.ethnicities.get(eth_name)
            if ethnicity is None:
                sys.stderr.write('no rows for %r; skipping\n' % eth_name)
                continue
            self.print_closest(ethnicity)

    def _load(self, lines):
        """Accumulate per-ethnicity name counts from an iterable of CSV lines.

        Column 2 is read as the ethnicity label, column 3 as the name and
        column 4 as an integer count.  Lines without those columns, or whose
        count is not an integer (e.g. the title line), are ignored.
        """
        for line in lines:
            columns = line.strip('\n').split(',')
            try:
                eth_name = columns[2]
                name_value = columns[3]  # the actual name in this row
                name_count = int(columns[4])
            except (IndexError, ValueError):
                # Title line, blank line or otherwise malformed row.
                continue

            # Make sure we have initialized this ethnicity.
            if eth_name not in self.ethnicities:
                self.ethnicities[eth_name] = Ethnicity(eth_name)

            # Add to any earlier count rather than overwriting it, so several
            # rows for the same (ethnicity, name) pair all contribute.
            name_freqs = self.ethnicities[eth_name].name_freqs
            name_freqs[name_value] = name_freqs.get(name_value, 0) + name_count
            self.add_name_to_index(name_value)

    def add_name_to_index(self, name_value):
        """Assign ``name_value`` the next free vector index if it has none."""
        if name_value not in self.name_indexes:
            # This method hands indexes out densely as 0, 1, 2, ..., so the
            # next free one equals the current size; no need to scan for the
            # maximum on every insert.
            self.name_indexes[name_value] = len(self.name_indexes)

    def compare(self, borough1, borough2):
        """Return the Euclidean distance between two groups' name vectors."""
        vector1 = self.vectorize(borough1)
        vector2 = self.vectorize(borough2)
        return np.linalg.norm(vector1 - vector2)

    def vectorize(self, borough):
        """Return the group's normalized name frequencies as a numpy vector.

        The vector has one slot per indexed name; names the group does not
        have stay at zero.  Raises KeyError if the group holds a name that
        was never passed to add_name_to_index().
        """
        vector = np.zeros(len(self.name_indexes))
        for name, freq in borough.get_normalized_name_freqs().items():
            vector[self.name_indexes[name]] = freq
        return vector

    def print_closest(self, ethnicity):
        """Print every other known group, nearest first, with its distance."""
        closest = [
            (other.name, self.compare(ethnicity, other))
            for other in self.ethnicities.values()
            if other is not ethnicity  # make sure not to compare with self
        ]
        closest.sort(key=lambda pair: pair[1])  # by distance, low to high

        print('*******%s************' % ethnicity.name)
        for other_name, distance in closest:
            print('\t%s\t%f' % (other_name, distance))
if __name__ == '__main__':
    # Run the analysis only when executed as a script, not when imported.
    app = Main()
    app.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment