Skip to content

Instantly share code, notes, and snippets.

@jbeluch jbeluch/
Created Mar 1, 2011

What would you like to do?
Scrapes the movies listed on a website (the site name is missing from this copy) and creates a histogram of movies per year.
#!/usr/bin/env python
"""
Author: Jonathan Beluch

Scrapes movie titles from a movie-listing website (the URL is missing from
this copy) and creates a histogram showing the distribution of titles by year.

Requires BeautifulSoup, matplotlib and numpy.
"""
from BeautifulSoup import BeautifulSoup as BS
from urlparse import urljoin, parse_qs
from urllib import urlencode
from itertools import chain
import urllib2
import re
# NOTE(review): BASE_URL and HOMEPAGE_URL are referenced further down
# (get_genre_urls and the __main__ section) but are not defined anywhere in
# the visible part of this file -- they must be defined before running.

# Output file name for the histogram image.
CHART_FN = 'histogram.png'
# File the scraped titles are written to (one per line) so that a later run
# can skip the download step.
TITLES_FN = 'movie_titles'
def get_page(url, data=None):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request.
        data: Optional url-encoded request body, passed straight through to
            ``urllib2.urlopen`` (which sends a POST when a body is given).

    Returns:
        The complete response body as read from the connection.

    Raises:
        Whatever ``urllib2.urlopen`` raises for network/HTTP failures.
    """
    response = urllib2.urlopen(url, data)
    try:
        return response.read()
    finally:
        # Always release the connection, even if reading fails half way.
        response.close()
def get_genre_urls(url):
    """Return the absolute URL of every genre link found on the page at ``url``.

    The links are taken from the <ul> inside the div whose id is 'rightcol'
    and resolved against the module-level BASE_URL.
    """
    # fix terrible html so beautiful soup doesn't barf
    fixups = (
        ('</font color>', '</font>'),
        (r'<ol class=\"latestnews \">', '<ol class="latestnews">'),
    )
    markup = get_page(url)
    for broken, repaired in fixups:
        markup = markup.replace(broken, repaired)

    # <a> tag class names aren't consistent, so grab the 'rightcol' div and
    # parse the child <a> tags
    sidebar = BS(markup).find('div', {'id': 'rightcol'})

    genre_urls = []
    for anchor in sidebar.ul.findAll('a'):
        genre_urls.append(urljoin(BASE_URL, anchor['href']))
    return genre_urls
def parse_movies(url):
    """Return the list of movie title strings shown on a genre page.

    ``url`` must carry an ``id`` parameter in its query string; that id is
    re-posted to the same address together with ``limit=0``.
    """
    _unused_base, query_string = url.split('?', 1)
    genre_id = parse_qs(query_string)['id'][0]

    # NOTE(review): 'limit': '0' presumably asks the site for the full,
    # unpaginated listing -- confirm against the site.
    post_body = urlencode({'id': genre_id, 'limit': '0'})
    page = BS(get_page(url, post_body))

    print('Parsing movies from \'%s\'' % page.title.string.strip())

    # Each movie sits in a table row whose class contains 'sectiontableentry';
    # the title is the text of the row's first link.
    rows = page.findAll('tr', {'class': re.compile('sectiontableentry')})
    titles = []
    for row in rows:
        titles.append(row.a.string.strip())
    return titles
# Matches a four-digit year wrapped in parentheses, e.g. "(1999)", and
# captures the digits.
p = re.compile(r'\((\d{4})\)')


def get_year(title):
    """Extract the release year from a title such as ``'The Matrix (1999)'``.

    Args:
        title: Title string expected to contain a parenthesised 4-digit year.

    Returns:
        The first parenthesised four-digit year as an ``int``, or ``None``
        (after printing a warning) when the title contains no such year.
    """
    # search(), not match(): the year appears after the title text, not at
    # the start of the string.
    m = p.search(title)
    if m:
        return int(m.group(1))
    print("Warning: No year match for '%s'." % title)
    return None
def print_to_file(items, fn):
    """Write every item of ``items`` to the file ``fn``, one per line.

    The file is written as UTF-8 so that titles containing non-ASCII
    characters are stored instead of raising ``UnicodeEncodeError`` (which
    ``str()`` does for such unicode strings on Python 2).

    Args:
        items: Iterable of values to write; each is converted to text.
        fn: Path of the file to create or overwrite.
    """
    import io

    with io.open(fn, 'w', encoding='utf-8') as f:
        for item in items:
            if isinstance(item, bytes):
                # Byte strings are assumed to already be UTF-8 text.
                item = item.decode('utf-8', 'replace')
            f.write(u'%s\n' % (item,))
def get_from_file(fn):
    """Read the file ``fn`` and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; blank lines are kept as empty strings.

    Args:
        fn: Path of the file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(fn) as f:
        # Iterate the file directly and build a real list, so the result can
        # be indexed and iterated more than once.
        return [line.strip() for line in f]
def create_plot(years):
    """Draw a movies-per-year histogram and save it to ``CHART_FN``.

    Args:
        years: Iterable of integer years, one entry per title.

    Raises:
        ValueError: If ``years`` is empty.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MultipleLocator

    years = list(years)
    if not years:
        raise ValueError('create_plot() needs at least one year to plot')

    # Set up the figure and main plot
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Use explicit bin edges so each bin covers exactly a 1 year period.
    # Passing an integer bin count of (max - min) would merge the two most
    # recent years into the final bin (the last bin is closed on both ends)
    # and would fail outright when every title is from the same year.
    bin_edges = np.arange(min(years), max(years) + 2)
    ax.hist(years, bins=bin_edges, align='left')

    # Change major axes tickers to multiples of 5 instead of default 10
    # NOTE(review): only the x (year) axis is adjusted here; the original
    # locator calls are missing from this copy of the file -- confirm.
    ax.xaxis.set_major_locator(MultipleLocator(5))

    # Set up axes labels and plot title
    ax.set_ylabel('Number of Movies + TV Shows')
    ax.set_title('Number of Movies and TV Shows per Year')

    # Enlarge the output figure so things aren't as cramped
    # Defaults is (8, 6)
    fig.set_size_inches(12, 9)

    # Save output
    print("Saving histogram to '%s' ..." % CHART_FN)
    fig.savefig(CHART_FN)
if __name__ == '__main__':
    # Script entry point: scrape every genre page, cache the titles on disk,
    # pull the year out of each title and plot the distribution.
    # NOTE(review): HOMEPAGE_URL is not defined in the visible part of this
    # file; it must exist at module level for this to run.
    print('Downloading genres...')
    genre_urls = get_genre_urls(HOMEPAGE_URL)

    print('Downloading movie titles...')
    # parse_movies returns one list per genre; flatten list of lists
    titles = list(chain.from_iterable(
        parse_movies(genre_url) for genre_url in genre_urls))

    # Write movie titles to file in case you want to run again without
    # downloading everything
    print("Saving movie titles to '%s' ..." % TITLES_FN)
    print_to_file(titles, TITLES_FN)

    # If you want to rerun without downloading everything, comment out above
    # and start here:
    #titles = get_from_file(TITLES_FN)

    # parse year from title strings, dropping titles with no year match
    # (get_year returns None for those)
    print('Parsing years...')
    years = [year for year in map(get_year, titles) if year is not None]

    # build and save the graph
    create_plot(years)
    print('Done.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.