Create a gist now

Instantly share code, notes, and snippets.

Scrapes movies listed on a website and creates a histogram of movies per year.
#!/usr/bin/env python
'''
Author: Jonathan Beluch

Scrapes movie titles from a movie-listing website (the URL is missing from
this copy) and creates a histogram showing the distribution of titles by year.
Requires BeautifulSoup, matplotlib and numpy.
'''
from BeautifulSoup import BeautifulSoup as BS
from urlparse import urljoin, parse_qs
from urllib import urlencode
from itertools import chain
import urllib2
import re
# Path the histogram image is saved to (reported by create_plot).
CHART_FN = 'histogram.png'
# Path the scraped movie titles are written to, and optionally re-read from,
# so the script can be rerun without downloading everything again.
TITLES_FN = 'movie_titles'
# NOTE(review): BASE_URL and HOMEPAGE_URL are referenced further down
# (get_genre_urls and the __main__ section) but are not defined anywhere in
# the visible source -- they must be defined before the script can run.
def get_page(url, data=None):
    """Fetch ``url`` and return the raw response body.

    ``data`` is handed unchanged to ``urllib2.urlopen``; when it is not
    None, urlopen sends it as the request body (i.e. the request becomes a
    POST).  Errors raised by urlopen or by the read propagate to the caller.
    """
    u = urllib2.urlopen(url, data)
    try:
        # The original assignment had lost its right-hand side; reading the
        # whole response is what the callers (which parse the result as
        # HTML) rely on.
        src = u.read()
    finally:
        # Release the connection even when the read fails.
        u.close()
    return src
def get_genre_urls(url):
    """Return absolute URLs for the links in the 'rightcol' div of ``url``.

    The page is downloaded with get_page, its known-bad markup is patched,
    and every <a> under the first <ul> of the div with id 'rightcol' is
    resolved against BASE_URL.
    """
    markup = get_page(url)

    # The page ships malformed fragments that make BeautifulSoup choke, so
    # swap each one for a well-formed equivalent before parsing.
    repairs = (
        ('</font color>', '</font>'),
        (r'<ol class=\"latestnews \">', '<ol class="latestnews">'),
    )
    for broken, fixed in repairs:
        markup = markup.replace(broken, fixed)

    soup = BS(markup)

    # Class names on the <a> tags are inconsistent, so reach the links
    # through their container instead.
    sidebar = soup.find('div', {'id': 'rightcol'})
    genre_urls = []
    for anchor in sidebar.ul.findAll('a'):
        genre_urls.append(urljoin(BASE_URL, anchor['href']))
    return genre_urls
def parse_movies(url):
    """Return the stripped link text of every movie row on the page at ``url``.

    The ``id`` query parameter of ``url`` is re-sent, together with
    ``limit=0``, as request data via get_page.  Rows are the <tr> tags whose
    class matches 'sectiontableentry'; each row contributes the text of its
    first <a>.
    """
    _path, query_string = url.split('?', 1)
    listing_id = parse_qs(query_string)['id'][0]

    # NOTE(review): limit='0' presumably asks the site for the full,
    # unpaginated listing -- confirm against the site's behaviour.
    form = {'id': listing_id, 'limit': '0'}
    soup = BS(get_page(url, urlencode(form)))

    print('Parsing movies from \'%s\'' % soup.title.string.strip())

    row_class = re.compile('sectiontableentry')
    titles = []
    for row in soup.findAll('tr', {'class': row_class}):
        titles.append(row.a.string.strip())
    return titles
# A four-digit year wrapped in parentheses, e.g. "(1999)"; group 1 holds the
# digits without the parentheses.
p = re.compile(r'\((\d{4})\)')


def get_year(title):
    """Return the parenthesised four-digit year found in ``title`` as an int.

    The first ``(YYYY)`` occurrence anywhere in the title is used.  When the
    title contains no such year, a warning is printed and None is returned
    so the caller can filter the entry out.
    """
    # search(), not match(): the year normally follows the movie name rather
    # than starting the string.
    m = p.search(title)
    if m:
        return int(m.group(1))
    print("Warning: No year match for '%s'." % title)
    return None
def print_to_file(items, fn):
    """Write ``str(item)`` for each item of ``items`` to ``fn``, one per line.

    The file is opened in 'w' mode, so any existing content is replaced.
    """
    with open(fn, 'w') as out:
        lines = [str(item) + '\n' for item in items]
        out.writelines(lines)
def get_from_file(fn):
    """Read ``fn`` and return its lines with surrounding whitespace stripped."""
    with open(fn) as source:
        stripped = [line.strip() for line in source.readlines()]
    return stripped
def create_plot(years):
    """Draw a histogram of ``years`` with one bar per year and save it.

    ``years`` is a non-empty sequence of integer years.  The figure is
    written to the path in CHART_FN.

    Raises ValueError when ``years`` is empty.
    """
    # The plotting dependencies are imported here rather than at module
    # level, so they are only needed when a plot is actually produced.
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MultipleLocator

    if len(years) == 0:
        raise ValueError('create_plot() needs at least one year to plot')

    # Set up the figure and main plot
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Give every calendar year its own bin by passing explicit edges
    # [min, min + 1, ..., max + 1].  Passing bins=(max - min) instead folds
    # the last two years into a single bin (the final bin is closed on both
    # ends) and fails outright when all years are equal (zero bins).
    # align='left' centres each bar on its year.
    first_year, last_year = min(years), max(years)
    bin_edges = np.arange(first_year, last_year + 2)
    ax.hist(years, bins=bin_edges, align='left')

    # Change major ticks to multiples of 5 instead of the default 10.
    # NOTE(review): the statement for this step was missing; the year (x)
    # axis is assumed to be the one intended -- confirm.
    ax.xaxis.set_major_locator(MultipleLocator(5))

    # Set up axes labels and plot title
    ax.set_ylabel('Number of Movies + TV Shows')
    ax.set_title('Number of Movies and TV Shows per Year')

    # Enlarge the output figure so things aren't as cramped; the default
    # size is (8, 6).
    fig.set_size_inches(12, 9)

    # Save output
    print("Saving histogram to '%s' ..." % CHART_FN)
    fig.savefig(CHART_FN)
# Script entry point: scrape the genre pages, cache the titles on disk,
# extract the years and plot their distribution.
if __name__ == '__main__':
    print('Downloading genres...')
    genre_urls = get_genre_urls(HOMEPAGE_URL)

    print('Downloading movie titles...')
    # One list of titles per genre page, then flattened into a single list.
    movies_by_genre = [parse_movies(genre_url) for genre_url in genre_urls]
    titles = list(chain.from_iterable(movies_by_genre))

    # Write movie titles to file in case you want to run again without
    # downloading everything
    print("Saving movie titles to '%s' ..." % TITLES_FN)
    print_to_file(titles, TITLES_FN)

    # If you want to rerun without downloading everything, comment out above
    # and start here:
    #titles = get_from_file(TITLES_FN)

    # Parse the year from each title string; get_year returns None for
    # titles without a year, and those entries are dropped.
    print('Parsing years...')
    years = [year for year in map(get_year, titles) if year]

    # Build and save the graph.  This call was missing, so the script
    # reported 'Done.' without ever producing the histogram.
    create_plot(years)
    print('Done.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment