Created
June 14, 2011 21:57
-
-
Save minrk/1026020 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from string import uppercase
import numpy as np
from matplotlib import pyplot as plt

# Record layout shared by the names tables built in this module:
#   name   - given name, byte string of up to 32 characters
#   gender - single-character code (the code below compares against 'F' and 'M')
#   count  - count value read from the third column of a data line
#   freq   - count divided by the total count of the same gender in that file
#   year   - year the record belongs to (filled in by fetch_and_parse_all)
dt = [('name', 'a32'), ('gender', 'a1'), ('count', int), ('freq', float), ('year', int)]
def fetch_names(dest='names.zip'):
    """Return the baby-names archive as an open ``zipfile.ZipFile``.

    The archive is opened from *dest* when that path already holds a
    readable zip file.  Otherwise it is downloaded to *dest* first and
    then opened.

    Parameters
    ----------
    dest : str
        Local path used as the cache for the downloaded archive.

    Returns
    -------
    zipfile.ZipFile
        The archive opened for reading; the caller should close it.

    Raises
    ------
    IOError, zipfile.BadZipfile
        If the archive still cannot be opened after downloading.
    """
    import zipfile
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    url = "http://www.ssa.gov/OACT/babynames/names.zip"
    try:
        return zipfile.ZipFile(dest)
    except (IOError, OSError, zipfile.BadZipfile):
        # No usable cached copy (missing, unreadable or corrupt file):
        # fetch a fresh one.  Other errors are real bugs and propagate.
        urlretrieve(url, dest)
    return zipfile.ZipFile(dest)
def parse_fp(fp):
    """Parse one ``name,gender,count`` file object into a record array.

    Blank lines are skipped.  Every remaining line must split on commas
    into exactly three values.

    Parameters
    ----------
    fp : file-like
        Object providing ``readlines()``.

    Returns
    -------
    numpy.recarray
        Records with the module-level ``dt`` layout, sorted in ascending
        order of ``count``.  ``freq`` holds each record's share of the
        total count of its own gender ('F' or 'M'); ``year`` is left at 0.
    """
    records = []
    for raw in fp.readlines():
        stripped = raw.strip()
        if stripped:
            name, gender, count = stripped.split(',')
            records.append((name, gender, count, 0, 0))

    table = np.recarray(len(records), dtype=dt)
    table[:] = records

    # Normalise counts separately per gender so each gender sums to 1.
    for code in ('F', 'M'):
        selected = table['gender'] == code
        group_counts = table['count'][selected]
        table['freq'][selected] = group_counts / (group_counts.sum() * 1.)

    table.sort(order=['count'])
    return table
def fetch_and_parse_all(fname='names.zip'):
    """Load every per-year file of the names archive into one record array.

    The archive is obtained through ``fetch_names(fname)``.  Each member
    whose name contains a run of digits is parsed with ``parse_fp`` and
    tagged with that number as its ``year``; members without digits
    (for example a bundled README) are skipped.  Each year is printed as
    it is processed.

    Parameters
    ----------
    fname : str
        Local path of the (possibly not yet downloaded) archive.

    Returns
    -------
    numpy.recarray
        All parsed records, in archive order, with the ``dt`` layout.
    """
    archive = fetch_names(fname)
    per_year = []
    try:
        for member in archive.namelist():
            match = re.search(r'\d+', member)
            if match is None:
                # Not a per-year data file; nothing to parse.
                continue
            year = int(match.group())
            print(year)
            table = parse_fp(archive.open(member))
            table['year'] = year
            per_year.append(table)
    finally:
        # Release the file handle even if a member fails to parse.
        archive.close()

    # Copy the per-year tables into one preallocated recarray so the
    # result keeps the recarray type.
    total = sum(len(table) for table in per_year)
    combined = np.recarray(total, dtype=dt)
    offset = 0
    for table in per_year:
        stop = offset + len(table)
        combined[offset:stop] = table
        offset = stop
    return combined
def plot_name(data, name, gender=None, scale=True):
    """Plot one name's yearly series on the current matplotlib axes.

    Parameters
    ----------
    data : record array
        Table with ``name``, ``gender``, ``year``, ``freq`` and ``count``
        fields.
    name : str
        Name to select.
    gender : optional
        When truthy, only rows whose ``gender`` equals this value are kept.
    scale : bool
        If true, plot ``100 * freq``; otherwise plot the raw ``count``.

    Returns
    -------
    record array
        The selected rows, sorted by ``year``.
    """
    selector = data['name'] == name
    if gender:
        selector = selector & (data['gender'] == gender)

    rows = data[selector]
    rows.sort(order='year')

    values = 100 * rows['freq'] if scale else rows['count']
    plt.plot(rows['year'], values)
    return rows
def plot_names(data, names, genders=None, scale=True):
    """Plot several names together in a new figure with a legend.

    Parameters
    ----------
    data : record array
        Table passed through to ``plot_name``.
    names : sequence of str
        Names to plot, one line each; also used as legend labels.
    genders : optional
        ``None`` or any value of length 1 (e.g. ``'F'``) is reused as-is
        for every name; otherwise it is paired element-wise with *names*.
    scale : bool
        Passed through to ``plot_name``.
    """
    plt.figure()

    reuse_for_all = genders is None or len(genders) == 1
    if reuse_for_all:
        genders = [genders] * len(names)

    pairs = zip(names, genders)
    for name, gender in pairs:
        plot_name(data, name, gender, scale)

    legend = plt.legend(names, loc=0)
    legend.set_alpha(0.8)
def parse(yobfile):
    """Open the names file at path *yobfile* and parse it with ``parse_fp``.

    Parameters
    ----------
    yobfile : str
        Path of a ``name,gender,count`` text file.

    Returns
    -------
    Whatever ``parse_fp`` returns for the opened file.
    """
    with open(yobfile) as f:
        return parse_fp(f)
def counts(A):
    """Sum the ``freq`` column of *A* per initial letter A-Z.

    Parameters
    ----------
    A : structured array
        Must have a ``freq`` field and either a ``first`` field holding
        each record's initial letter or a ``name`` field from which the
        initial is taken.

    Returns
    -------
    numpy.recarray
        26 records ``(first, freq)``, one per uppercase ASCII letter in
        alphabetical order, where ``freq`` is the summed frequency of the
        records starting with that letter.
    """
    # ascii_uppercase is always exactly 26 letters, unlike the
    # locale-dependent (and Python 2 only) ``string.uppercase``.
    from string import ascii_uppercase

    if 'first' in A.dtype.names:
        initials = A['first']
    else:
        # Casting to a one-character string of the same kind (bytes or
        # unicode) keeps only the first character of every name.
        names = A['name']
        initials = names.astype(names.dtype.kind + '1')

    sums = np.recarray(26, dtype=[('first', 'S1'), ('freq', float)])
    for i, c in enumerate(ascii_uppercase):
        # Convert the letter to the column's own string type so the
        # comparison works for both byte and unicode columns.
        mask = initials == np.array(c, dtype=initials.dtype)
        total = (mask * A['freq']).sum()
        sums[i] = (c, total)
    return sums
def plot_counts(C, color='b', title=''):
    """Draw a 26-bar chart of per-letter frequencies on the current axes.

    Parameters
    ----------
    C : structured array
        26 records with a ``freq`` field, one per letter A-Z in order.
    color : matplotlib color
        Fill color of the bars.
    title : str
        Title of the plot.
    """
    from string import ascii_uppercase

    plt.title(title)
    # Bars are anchored at their left edge so that the +.4 tick offset
    # below (half of the default 0.8 bar width) lands on each bar's centre.
    plt.bar(range(26), 100 * C['freq'], color=color, align='edge')
    plt.xticks(np.arange(26) + .4, list(ascii_uppercase))
    plt.xlim(0, 27)
def print_highest(C, n=5):
    """Print the *n* records of *C* with the largest ``freq``, highest first.

    Each record is printed on its own line as its two fields separated by
    a space.  *C* itself is not modified; a sorted copy is used.

    Parameters
    ----------
    C : structured array
        Two-field records that include a ``freq`` field.
    n : int
        Number of records to print; nothing is printed when ``n <= 0``.
    """
    if n <= 0:
        # C[-0:] would select the whole array, so handle this explicitly.
        return
    ranked = C.copy()
    ranked.sort(order='freq')
    for c, f in ranked[-n:][::-1]:
        print("%s %s" % (c, f))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment