Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Analysis of Nobel Prize winners and their ages
from bs4 import BeautifulSoup
import unicodedata
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from scipy.stats import norm
class Prize:
    """One Nobel Prize awarded to one laureate.

    Attributes:
        name: Laureate's name reduced to plain ASCII text.
        age: Age of the laureate, as supplied by the caller.
        year: Year of the award, as supplied by the caller.
        prize_type: Label of the prize category.
        description: Optional free-text description of the award.
    """

    def __init__(self, name, age, year, prize_type, description=None):
        """Store the award details.

        ``name`` must be a unicode string. ``description`` is optional so
        that callers which only know the four core fields can still build
        a ``Prize``.
        """
        # NFKD splits an accented letter into its base letter plus a combining
        # mark; the ASCII round trip then drops the marks (umlaut issues).
        # Decoding back keeps ``name`` a text string so that ``__str__`` can
        # concatenate it on Python 3 as well as Python 2.
        ascii_bytes = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')
        self.name = ascii_bytes.decode('ascii')
        self.age = age
        self.year = year
        self.prize_type = prize_type
        self.description = description

    def __str__(self):
        """Return a one-line summary: '<name> won <type> at age <age> in <year>'."""
        return (self.name + ' won ' + str(self.prize_type)
                + ' at age ' + str(self.age) + ' in ' + str(self.year))
# Parse the saved laureates-by-age page into a list of Prize objects.
# The context manager closes the file as soon as its contents are parsed.
with open('nobel_laureates_by_age.html', 'r') as f:
    html = BeautifulSoup(f.read())
winners = []
prize_types = set()
nobel_prize_string = "The Nobel Prize in "
for tag in html.find("div", id="nobel-age-info").children:
    # The container interleaves <h3> headings with <div> entries that carry
    # no class or id, so the children are walked in order and each entry is
    # given the age taken from the most recent heading.
    if tag.name is None:
        # Bare text nodes (e.g. whitespace between tags) have no tag name.
        continue
    if tag.name == 'h3':
        # The last word of the heading is the age for the entries below it.
        current_age = int(tag.text.split(" ")[-1])
    elif tag.name == 'div':
        name = tag.find("h6").text  # winner's name
        # Text of the first link in the first paragraph: the prize label
        # followed by the award year as its last word.
        description = tag.find_all("p")[0].find("a").text
        words = description.split(' ')
        year = int(words[-1])
        prize_type = ' '.join(words[:-1])
        prize_types.add(prize_type)
        # Prize takes the description as its fifth argument.
        prize = Prize(name, current_age, year, prize_type, description)
        winners.append(prize)
# For every prize type (plus one pseudo-type covering all prizes) print a CSV
# summary row and save a histogram of winner ages with a fitted normal curve.
all_prize_string = "All Prizes"
ts = list(prize_types)
ts.append(all_prize_string)  # want to get all prizes too
print("Type, Number of Winners, Mean Age, Variance of Ages")
for prize_type in ts:
    ages = [p.age for p in winners
            if prize_type == all_prize_string or p.prize_type == prize_type]
    if not ages:
        # Nothing to plot or fit (only possible for "All Prizes" when no
        # winners were parsed).
        continue
    # One bin per year of age. min/max do not rely on the list being sorted,
    # and at least one bin is needed when every winner has the same age.
    num_bins = max(max(ages) - min(ages), 1)
    fig = plt.figure()
    # density=True replaces the removed normed=1 (needs matplotlib >= 2.1).
    # NOTE: the histogram is normalised, so the y axis is a density rather
    # than a raw count, despite the axis label below.
    _, bins, _ = plt.hist(ages, num_bins, density=True, facecolor='green', alpha=0.2)
    # norm.fit returns (mean, standard deviation); the variance that is
    # reported below is the square of the latter.
    mean, std = norm.fit(ages)
    var = std ** 2
    # scipy's norm.pdf stands in for the removed matplotlib.mlab.normpdf.
    y = norm.pdf(bins, mean, std)
    plt.plot(bins, y, 'r--')
    plt.ylabel('Number of Winners')
    plt.xlabel('Age')
    plt.title(prize_type + '. Mean: ' + str(round(mean, 2)) + ', Var: ' + str(round(var, 2)))
    fig.savefig('nobel_hist_' + prize_type.lower().replace(' ', '_') + '.png', dpi=500, format='png')
    # Release the figure so that figures do not pile up in memory.
    plt.close(fig)
    print(prize_type + ', ' + str(len(ages)) + ', ' + str(round(mean, 2)) + ', ' + str(round(var, 2)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.