Skip to content

Instantly share code, notes, and snippets.

@jackschultz
Created June 7, 2015 19:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jackschultz/38c8462d8c3b6d74f422 to your computer and use it in GitHub Desktop.
Save jackschultz/38c8462d8c3b6d74f422 to your computer and use it in GitHub Desktop.
Analysis of Nobel Prize winners and their ages
from bs4 import BeautifulSoup
import unicodedata
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from scipy.stats import norm
class Prize:
    """One Nobel Prize award to a single laureate.

    Attributes:
        name: Laureate name reduced to plain ASCII (accents are stripped).
        age: Age value stored as given by the caller.
        year: Year value stored as given by the caller.
        prize_type: Label of the prize category.
        description: Optional free-text description of the award; ``None``
            when the caller does not supply one.
    """

    def __init__(self, name, age, year, prize_type, description=None):
        """Store the award details, normalising ``name`` to ASCII.

        Args:
            name: Laureate name as text (``unicode`` on Python 2).
            age: Age associated with the award.
            year: Year associated with the award.
            prize_type: Label of the prize category.
            description: Optional description of the award. Defaults to
                ``None`` so that four-argument callers keep working.
        """
        # NFKD decomposition separates base letters from their combining
        # marks; encoding to ASCII with 'ignore' then drops the marks
        # (this is the workaround for the umlaut issues).
        ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')
        # On Python 3 encode() returns bytes, which cannot be concatenated
        # with str in __str__; convert back to the native text type there.
        # On Python 2 the encoded value already is ``str`` and is kept as is.
        if not isinstance(ascii_name, str):
            ascii_name = ascii_name.decode('ascii')
        self.name = ascii_name
        self.age = age
        self.year = year
        self.prize_type = prize_type
        self.description = description

    def __str__(self):
        """Return a one-line summary: '<name> won <type> at age <age> in <year>'."""
        return (self.name + ' won ' + str(self.prize_type) +
                ' at age ' + str(self.age) + ' in ' + str(self.year))
# Parse the saved laureates-by-age page into a list of Prize records.
# The context manager closes the file as soon as the markup has been read.
with open('nobel_laureates_by_age.html', 'r') as f:
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers happen to be installed.
    html = BeautifulSoup(f.read(), 'html.parser')

winners = []
prize_types = set()
nobel_prize_string = "The Nobel Prize in "  # NOTE(review): not referenced in the visible script

# The container's children alternate between <h3> headings (whose last word
# is parsed as an age) and <div> entries describing a winner. The winner divs
# carry no class or id, so they are recognised purely by tag name.
for tag in html.find("div", id="nobel-age-info").children:
    if tag.name is None:
        # Children without a tag name (e.g. text between elements) carry
        # no data for us; skip them.
        continue
    if tag.name == 'h3':
        current_age = int(tag.text.split(" ")[-1])  # update the age
    elif tag.name == 'div':
        name = tag.find("h6").text  # winner's name
        # Link text of the first paragraph: the prize label followed by the year.
        description = tag.find_all("p")[0].find("a").text
        # Split on the last space: everything before it is the prize type,
        # the final token is the year.
        prize_type, _, year_text = description.rpartition(' ')
        year = int(year_text)
        prize_types.add(prize_type)
        # Pass the description too; Prize.__init__ accepts it as its fifth argument.
        prize = Prize(name, current_age, year, prize_type, description)
        winners.append(prize)
# For every prize type (plus one pseudo-type covering all winners) print
# summary statistics of the winners' ages and save a histogram with a
# fitted normal curve to 'nobel_hist_<type>.png'.
all_prize_string = "All Prizes"
ts = list(prize_types)
ts.append(all_prize_string)  # want to get all prizes too

# Single parenthesised argument: valid as a print statement on Python 2
# and as a function call on Python 3.
print("Type, Number of Winners, Mean Age, Variance of Ages")
for prize_type in ts:
    include_all = prize_type == all_prize_string
    ages = [p.age for p in winners if include_all or p.prize_type == prize_type]
    if not ages:
        # Nothing to fit or plot (only possible when no winners were parsed).
        continue

    # One bin per year of age range. Use min/max rather than the first and
    # last elements so the result does not depend on list order, and keep
    # at least one bin when every winner has the same age.
    num_bins = max(1, max(ages) - min(ages))

    fig = plt.figure()
    # density=True normalises the histogram so it is comparable with the pdf.
    _, bins, _ = plt.hist(ages, num_bins, density=True, facecolor='green', alpha=0.2)

    # norm.fit returns (loc, scale), i.e. the mean and the standard
    # deviation; the variance that is reported is the square of the latter.
    mean, std = norm.fit(ages)
    var = std ** 2

    y = norm.pdf(bins, mean, std)
    plt.plot(bins, y, 'r--')
    # NOTE(review): the histogram is normalised, so the y axis shows a
    # density rather than a count; label kept as in the original.
    plt.ylabel('Number of Winners')
    plt.xlabel('Age')
    plt.title(prize_type + '. Mean: ' + str(round(mean, 2)) + ', Var: ' + str(round(var, 2)))
    fig.savefig('nobel_hist_' + prize_type.lower().replace(' ', '_') + '.png', dpi=500, format='png')
    # Release the figure; otherwise every iteration keeps one more open.
    plt.close(fig)

    print(prize_type + ', ' + str(len(ages)) + ', ' + str(round(mean, 2)) + ', ' + str(round(var, 2)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment