Skip to content

Instantly share code, notes, and snippets.

@minrk
Created June 14, 2011 21:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save minrk/1026020 to your computer and use it in GitHub Desktop.
Save minrk/1026020 to your computer and use it in GitHub Desktop.
import re
from string import uppercase
import numpy as np
from matplotlib import pyplot as plt
# Record layout shared by every table in this module.
#   name   - given name, byte string of at most 32 bytes
#   gender - single-byte code ('F' or 'M' in the parsed listings)
#   count  - number of occurrences reported for that name
#   freq   - count divided by the total count of the same gender
#   year   - year the record belongs to
# 'S' is the byte-string type code; the legacy 'a' alias used previously
# is the same type but was removed in NumPy 2.0.
dt = [
    ('name', 'S32'),
    ('gender', 'S1'),
    ('count', int),
    ('freq', float),
    ('year', int),
]
def fetch_names(dest='names.zip'):
    """Open the baby-names archive, downloading it first if necessary.

    Parameters
    ----------
    dest : str
        Path of the local zip file.  It is used as a cache: when it can be
        opened as a zip archive nothing is downloaded.

    Returns
    -------
    zipfile.ZipFile
        The archive, opened for reading.  The caller is responsible for
        closing it.

    Raises
    ------
    IOError, zipfile.BadZipfile
        If the file is still unusable after the download.
    """
    import zipfile
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    url = "http://www.ssa.gov/OACT/babynames/names.zip"
    try:
        return zipfile.ZipFile(dest)
    except (IOError, zipfile.BadZipfile):
        # Only a missing/unreadable or corrupt cache file triggers a
        # download; any other error is a real problem and propagates.
        urlretrieve(url, dest)
        return zipfile.ZipFile(dest)
def parse_fp(fp):
    """Parse one "name,gender,count" listing into a record array.

    Parameters
    ----------
    fp : iterable of lines
        An open file-like object.  Lines may be text or bytes (members
        opened from a zip archive yield bytes on Python 3).  Blank lines
        are ignored.

    Returns
    -------
    numpy.recarray
        Records with dtype ``dt``, sorted by ascending ``count``.  ``freq``
        holds each record's share of the total count of its own gender;
        ``year`` is left at 0 for the caller to fill in.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three comma-separated
        fields or its count is not an integer.
    """
    rows = []
    for line in fp:
        if not isinstance(line, str):
            # Byte lines on Python 3: decode so they can be split on ','.
            line = line.decode('ascii')
        line = line.strip()
        if not line:
            continue
        name, gender, count = line.split(',')
        # freq and year are placeholders, filled in below / by the caller.
        rows.append((name, gender, int(count), 0.0, 0))

    A = np.recarray(len(rows), dtype=dt)
    if rows:
        A[:] = rows

    for code in ('F', 'M'):
        # Build the comparison value in the field's own dtype so the test
        # is byte-string vs byte-string regardless of the Python version.
        mask = A['gender'] == np.array(code, dtype=A['gender'].dtype)
        total = A['count'][mask].sum()
        if total > 0:
            # An absent (or all-zero) gender keeps freq at 0 instead of
            # dividing by zero.
            A['freq'][mask] = A['count'][mask] / float(total)

    A.sort(order=['count'])
    return A
def fetch_and_parse_all(fname='names.zip'):
    """Parse every yearly listing in the names archive into one table.

    Parameters
    ----------
    fname : str
        Path of the zip archive; passed to ``fetch_names``, which downloads
        it when it is not available locally.

    Returns
    -------
    numpy.recarray
        All records of all years, dtype ``dt``, with ``year`` set from the
        digits in each archive member's file name.  Years appear in
        archive order.

    Notes
    -----
    Each year is printed as it is processed, as a progress indicator.
    """
    z = fetch_names(fname)
    per_year = []
    try:
        for member in z.namelist():
            match = re.search(r'\d+', member)
            if match is None:
                # Not a yearly data file (no year in its name); calling
                # .group() on the failed match used to crash here.
                continue
            year = int(match.group())
            print(year)
            fp = z.open(member)
            try:
                records = parse_fp(fp)
            finally:
                fp.close()
            records['year'] = year
            per_year.append(records)
    finally:
        z.close()

    # Copy the per-year tables into one preallocated record array.
    total = sum(len(records) for records in per_year)
    A = np.recarray(total, dtype=dt)
    start = 0
    for records in per_year:
        stop = start + len(records)
        A[start:stop] = records
        start = stop
    return A
def plot_name(data, name, gender=None, scale=True):
    """Plot one name's values against year on the current figure.

    Parameters
    ----------
    data : record array
        Table with 'name', 'gender', 'year', 'freq' and 'count' fields.
    name : str
        Name to select.
    gender : optional
        When truthy, only records whose 'gender' equals it are kept.
    scale : bool
        If true, plot ``100 * freq``; otherwise plot the raw ``count``.

    Returns
    -------
    The selected records, sorted by year.
    """
    selected = data['name'] == name
    if gender:
        selected = selected * (data['gender'] == gender)

    rows = data[selected]
    rows.sort(order='year')

    values = 100*rows['freq'] if scale else rows['count']
    plt.plot(rows['year'], values)
    return rows
def plot_names(data, names, genders=None, scale=True):
    """Plot several names on a new figure, with a legend.

    Parameters
    ----------
    data : record array
        Table passed through to ``plot_name``.
    names : sequence
        Names to plot; also used as the legend labels.
    genders : optional
        One gender per name.  ``None`` or a length-1 value is repeated for
        every name.
    scale : bool
        Passed through to ``plot_name``.
    """
    plt.figure()

    repeat_for_all = genders is None or len(genders) == 1
    if repeat_for_all:
        genders = [genders] * len(names)

    for pair in zip(names, genders):
        plot_name(data, pair[0], pair[1], scale)

    legend = plt.legend(names, loc=0)
    legend.set_alpha(0.8)
def parse(yobfile):
    """Parse a single listing file from disk.

    Parameters
    ----------
    yobfile : str
        Path of a "name,gender,count" text file.

    Returns
    -------
    The record array produced by ``parse_fp`` for that file.
    """
    with open(yobfile) as f:
        return parse_fp(f)
def counts(A):
    """Sum ``freq`` by first letter, for the 26 letters A-Z.

    Parameters
    ----------
    A : record array
        Must have a 'freq' field, plus either a 'first' field holding each
        record's initial or a string 'name' field from which the initial
        is taken.

    Returns
    -------
    numpy.recarray
        26 records with fields ``first`` (the letter) and ``freq`` (the
        sum of ``A['freq']`` over records with that initial), in
        alphabetical order.
    """
    # ascii_uppercase is always exactly A-Z; string.uppercase is
    # locale-dependent and does not exist on Python 3.
    from string import ascii_uppercase

    if 'first' in (A.dtype.names or ()):
        initials = A['first']
    else:
        # Casting to a 1-character string keeps only the first character.
        initials = A['name'].astype(A['name'].dtype.kind + '1')

    sums = np.recarray(
        len(ascii_uppercase),
        dtype=[('first', initials.dtype), ('freq', float)],
    )
    for i, letter in enumerate(ascii_uppercase):
        # Compare in the initials' own dtype (bytes vs text safe).
        mask = initials == np.array(letter, dtype=initials.dtype)
        sums[i] = (letter, A['freq'][mask].sum())
    return sums
def plot_counts(C, color='b', title=''):
    """Draw a bar chart of per-letter frequencies on the current axes.

    Parameters
    ----------
    C : record array
        26 records with a 'freq' field, one per letter A-Z in order.
    color : str
        Bar colour, passed to ``plt.bar``.
    title : str
        Plot title.
    """
    # ascii_uppercase is always exactly A-Z; string.uppercase is
    # locale-dependent and does not exist on Python 3.
    from string import ascii_uppercase

    positions = np.arange(len(ascii_uppercase))
    plt.title(title)
    # The tick offset below assumes bars start at their x position, so
    # request edge alignment explicitly rather than relying on the default.
    plt.bar(positions, 100*C['freq'], color=color, align='edge')
    plt.xticks(positions + .4, list(ascii_uppercase))
    plt.xlim(0, 27)
def print_highest(C, n=5):
    """Print the ``n`` records of ``C`` with the largest ``freq``.

    Parameters
    ----------
    C : record array
        Two-field records (label, freq) with a 'freq' field.  It is not
        modified; sorting happens on a copy.
    n : int
        Number of records to print.  Values <= 0 print nothing; values
        larger than ``len(C)`` print every record.

    Output is one "label freq" line per record, highest frequency first.
    """
    if n <= 0:
        # C[-0:] is the whole array, so without this guard n=0 would print
        # every record instead of none.
        return
    ranked = C.copy()
    ranked.sort(order='freq')
    for c, f in ranked[-n:][::-1]:
        print('%s %s' % (c, f))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment