Skip to content

Instantly share code, notes, and snippets.

@jbeluch
Created March 1, 2011 22:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbeluch/850010 to your computer and use it in GitHub Desktop.
Save jbeluch/850010 to your computer and use it in GitHub Desktop.
Scrapes movies listed on classiccinemaonline.com and creates a histogram of movies per year.
#!/usr/bin/env python
'''
Jonathan Beluch - web@jonathanbeluch.com
Scrapes movie titles from http://www.classiccinemaonline.com and
creates a histogram showing the distribution of titles by year.
Requires BeautifulSoup, matplotlib and numpy.
'''
from BeautifulSoup import BeautifulSoup as BS
from urlparse import urljoin, parse_qs
from urllib import urlencode
from itertools import chain
import urllib2
import re
# Root of the site; hrefs scraped from the homepage are joined against it.
BASE_URL = 'http://www.classiccinemaonline.com'
# Page whose 'rightcol' div holds the genre links.
HOMEPAGE_URL = BASE_URL + '/1/index.php'
# Output file names: the saved histogram image and the saved list of titles.
CHART_FN, TITLES_FN = 'histogram.png', 'movie_titles'
def get_page(url, data=None):
    '''Fetch ``url`` and return the raw response body.

    ``data``, when given, is handed to ``urllib2.urlopen`` unchanged as the
    request data (per the urllib2 documentation this turns an HTTP request
    into a POST). Callers are expected to pass it already url-encoded.

    Any exception raised by ``urlopen`` or ``read`` propagates to the caller.
    '''
    response = urllib2.urlopen(url, data)
    try:
        return response.read()
    finally:
        # Always release the connection, even when read() fails part-way.
        response.close()
def get_genre_urls(url):
    '''Return absolute URLs for the genre links found on the page at ``url``.

    The links are taken from the first <ul> inside the div with id
    'rightcol' and are resolved against BASE_URL.
    '''
    markup = get_page(url)

    # Patch up the known-broken tags so BeautifulSoup doesn't choke on them.
    repairs = (
        ('</font color>', '</font>'),
        (r'<ol class=\"latestnews \">', '<ol class="latestnews">'),
    )
    for broken, fixed in repairs:
        markup = markup.replace(broken, fixed)

    soup = BS(markup)

    # The <a> tags don't share a consistent class name, so locate the
    # 'rightcol' div and walk the anchors under its list instead.
    sidebar = soup.find('div', {'id': 'rightcol'})
    genre_urls = []
    for link in sidebar.ul.findAll('a'):
        genre_urls.append(urljoin(BASE_URL, link['href']))
    return genre_urls
def parse_movies(url):
    '''Return the list of titles shown on the genre page at ``url``.

    ``url`` must contain a query string with an ``id`` parameter. The page
    is requested with that id and ``limit=0`` sent as url-encoded request
    data (presumably limit=0 asks the site for the full, unpaginated
    listing -- confirm against the site). The page title is printed as a
    progress message.
    '''
    _path, query_string = url.split('?', 1)
    genre_id = parse_qs(query_string)['id'][0]
    form_data = urlencode({'id': genre_id, 'limit': '0'})

    soup = BS(get_page(url, form_data))
    print("Parsing movies from '%s'" % soup.title.string.strip())

    # Each title is the text of the first link in a row whose class
    # matches 'sectiontableentry'.
    row_class = re.compile('sectiontableentry')
    titles = []
    for row in soup.findAll('tr', {'class': row_class}):
        titles.append(row.a.string.strip())
    return titles
# Matches a four-digit number wrapped in parentheses, e.g. "(1942)", and
# captures the digits.
p = re.compile(r'\((\d{4})\)')


def get_year(title):
    '''Return the first parenthesised four-digit year in ``title`` as an int.

    When the title contains no such year, a warning is printed and None is
    returned.
    '''
    match = p.search(title)
    if match is None:
        print("Warning: No year match for '%s'." % title)
        return None
    return int(match.group(1))
def print_to_file(items, fn):
    '''Write each element of ``items`` to the file ``fn``, one per line.

    Text items are encoded as UTF-8, so titles containing non-ASCII
    characters are written correctly (converting them with ``str()`` raises
    UnicodeEncodeError on Python 2). Byte strings are written unchanged and
    any other object is converted to text first. An existing file is
    overwritten, and every line is terminated with a single '\\n'.
    '''
    def to_line(item):
        # Byte strings pass through untouched; everything else is turned
        # into text and encoded explicitly rather than relying on the
        # default ASCII codec.
        if isinstance(item, bytes):
            return item + b'\n'
        return (u'%s\n' % (item,)).encode('utf-8')

    # Binary mode: the lines are already encoded bytes.
    with open(fn, 'wb') as f:
        f.writelines(to_line(i) for i in items)
def get_from_file(fn):
    '''Return the lines of the file ``fn`` as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped
    from every line; blank lines are kept as empty strings.
    '''
    with open(fn) as f:
        return [line.strip() for line in f]
def create_plot(years):
    '''Plot a histogram of titles per year and save it to ``CHART_FN``.

    ``years`` is an iterable of integer years. Every year between the
    earliest and latest value gets its own one-year-wide bin, drawn centred
    on that year.

    Raises ValueError if ``years`` is empty.
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MultipleLocator

    # Materialise once: the values are scanned several times below and the
    # caller may hand in a one-shot iterator.
    years = list(years)
    if not years:
        raise ValueError('create_plot() needs at least one year to plot.')

    # Set up the figure and main plot
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Pass explicit bin edges [first, first + 1, ..., last + 1] so each year
    # falls in its own half-open bin [y, y + 1). A plain bin count of
    # (last - first) is one bin short: the histogram's final bin is closed
    # on both ends, which lumps the last two years together, and a count of
    # zero (all titles from the same year) is rejected outright.
    bin_edges = np.arange(min(years), max(years) + 2)
    # align='left' centres each bar on its left edge, i.e. on the year.
    ax.hist(years, bins=bin_edges, align='left')

    # Change major axes tickers to multiples of 5 instead of default 10
    ax.xaxis.set_major_locator(MultipleLocator(5))
    ax.yaxis.set_major_locator(MultipleLocator(5))
    ax.grid(True)

    # Set up axes labels and plot title
    ax.set_ylabel('Number of Movies + TV Shows')
    ax.set_xlabel('Year')
    ax.set_title('Number of Movies and TV Shows per Year')

    # Enlarge the output figure so things aren't as cramped
    # (the default is (8, 6)).
    fig.set_size_inches(12, 9)

    # Save output
    print("Saving histogram to '%s' ..." % CHART_FN)
    plt.savefig(CHART_FN)
if __name__ == '__main__':
    # Script entry point: scrape the titles, save them, then chart them by year.
    print('Downloading genres...')
    genre_urls = get_genre_urls(HOMEPAGE_URL)

    print('Downloading movie titles...')
    # Gather the titles of every genre page into one flat list.
    titles = []
    for genre_url in genre_urls:
        titles.extend(parse_movies(genre_url))

    # Write movie titles to file in case you want to run again without
    # downloading everything
    print("Saving movie titles to '%s' ..." % TITLES_FN)
    print_to_file(titles, TITLES_FN)

    # If you want to rerun without downloading everything, comment out above
    # and start here:
    #titles = get_from_file(TITLES_FN)

    # Parse the year out of each title string.
    print('Parsing years...')
    parsed_years = [get_year(title) for title in titles]
    # Drop the titles for which no year was found.
    years = [year for year in parsed_years if year]

    # Build and save the graph.
    create_plot(years)
    print('Done.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment