Scrapes movies listed on classiccinemaonline.com and creates a histogram of movies per year.
#!/usr/bin/env python | |
'''
Jonathan Beluch - web@jonathanbeluch.com
Scrapes movie titles from http://www.classiccinemaonline.com and
creates a histogram showing the distribution of titles by year.
Requires BeautifulSoup, matplotlib and numpy.
'''
from BeautifulSoup import BeautifulSoup as BS | |
from urlparse import urljoin, parse_qs | |
from urllib import urlencode | |
from itertools import chain | |
import urllib2 | |
import re | |
# Site root; scraped hrefs are resolved against it.
BASE_URL = 'http://www.classiccinemaonline.com'
# Page that is scraped for the list of genre links.
HOMEPAGE_URL = BASE_URL + '/1/index.php'
# Output file names: the rendered chart and the saved list of titles.
CHART_FN, TITLES_FN = 'histogram.png', 'movie_titles'
def get_page(url, data=None):
    '''Fetch ``url`` and return the raw response body.

    ``data`` is handed unchanged to ``urllib2.urlopen`` as its second
    argument. The response handle is closed even when reading the body
    raises, so a failed read no longer leaks the connection.
    '''
    u = urllib2.urlopen(url, data)
    try:
        return u.read()
    finally:
        u.close()
def get_genre_urls(url):
    '''Return the absolute URLs of the genre links on the page at ``url``.'''
    # The page markup is broken in ways that make BeautifulSoup choke, so the
    # known-bad fragments are patched before parsing.
    fixups = (
        ('</font color>', '</font>'),
        (r'<ol class=\"latestnews \">', '<ol class="latestnews">'),
    )
    page = get_page(url)
    for broken, fixed in fixups:
        page = page.replace(broken, fixed)

    # <a> tag class names aren't consistent, so locate the 'rightcol' div and
    # collect the anchors under its <ul> instead.
    sidebar = BS(page).find('div', {'id': 'rightcol'})
    links = []
    for anchor in sidebar.ul.findAll('a'):
        links.append(urljoin(BASE_URL, anchor['href']))
    return links
def parse_movies(url):
    '''Return the stripped link text of every movie row on the page at ``url``.

    The ``id`` value from the query string of ``url`` is sent back as form
    data together with ``limit=0``.
    NOTE(review): ``limit=0`` presumably asks the site for the full,
    unpaginated listing -- confirm against the site.
    Raises ValueError if ``url`` contains no ``?``.
    '''
    _path, query = url.split('?', 1)
    genre_id = parse_qs(query)['id'][0]
    form = urlencode({'id': genre_id, 'limit': '0'})

    soup = BS(get_page(url, form))
    print('Parsing movies from \'%s\'' % soup.title.string.strip())

    # Rows are matched by a regex on the class attribute rather than by one
    # exact class name.
    rows = soup.findAll('tr', {'class': re.compile('sectiontableentry')})
    titles = []
    for row in rows:
        titles.append(row.a.string.strip())
    return titles
# Matches a four-digit number wrapped in parentheses, e.g. "(1939)".
p = re.compile(r'\((\d{4})\)')


def get_year(title):
    '''Return the first parenthesised four-digit year in ``title`` as an int.

    Prints a warning and returns None when ``title`` contains no such year.
    '''
    found = p.search(title)
    if found is None:
        print("Warning: No year match for '%s'." % title)
        return None
    return int(found.group(1))
def print_to_file(items, fn):
    '''Write the ``str()`` form of each item to the file ``fn``, one per line.

    The file is opened in ``'w'`` mode, so existing content is replaced.
    '''
    with open(fn, 'w') as out:
        for item in items:
            out.write(str(item) + '\n')
def get_from_file(fn):
    '''Return the lines of the file ``fn`` with surrounding whitespace removed.'''
    with open(fn) as src:
        return [line.strip() for line in src]
def create_plot(years):
    '''Draw a histogram of ``years`` with one bar per year and save it to CHART_FN.

    ``years`` is an iterable of integer years. Raises ValueError when it is
    empty, because there is no range to build bins from.
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MultipleLocator

    # Materialize once: the values are scanned for min/max and then plotted.
    years = list(years)
    if not years:
        raise ValueError('create_plot() needs at least one year to plot')

    # Set up the figure and main plot
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Use explicit bin edges one year apart, ending one past the latest year,
    # so every year gets its own bin. Passing a bin *count* of (max - min)
    # instead makes the closed final bin [max - 1, max] merge the two latest
    # years, and produces zero bins when all titles share a single year.
    edges = np.arange(min(years), max(years) + 2)
    # align='left' centres each bar on its bin's left edge, i.e. on the year.
    ax.hist(years, bins=edges, align='left')

    # Change major axes tickers to multiples of 5 instead of default 10
    ax.xaxis.set_major_locator(MultipleLocator(5))
    ax.yaxis.set_major_locator(MultipleLocator(5))
    ax.grid(True)

    # Set up axes labels and plot title
    ax.set_ylabel('Number of Movies + TV Shows')
    ax.set_xlabel('Year')
    ax.set_title('Number of Movies and TV Shows per Year')

    # Enlarge the output figure so things aren't as cramped
    # (default is (8, 6))
    fig.set_size_inches(12, 9)

    # Save output
    print("Saving histogram to '%s' ..." % CHART_FN)
    fig.savefig(CHART_FN)
if __name__ == '__main__':
    print('Downloading genres...')
    genre_urls = get_genre_urls(HOMEPAGE_URL)

    print('Downloading movie titles...')
    # One list of titles per genre page, flattened into a single list.
    per_genre = [parse_movies(genre_url) for genre_url in genre_urls]
    titles = list(chain.from_iterable(per_genre))

    # Save the titles to disk so a later run can skip the download.
    print("Saving movie titles to '%s' ..." % TITLES_FN)
    print_to_file(titles, TITLES_FN)

    # If you want to rerun without downloading everything, comment out above
    # and start here:
    #titles = get_from_file(TITLES_FN)

    # Parse the year out of every title string.
    print('Parsing years...')
    parsed = [get_year(title) for title in titles]
    # Drop the titles for which no year was found.
    years = [year for year in parsed if year]

    # Build and save the graph.
    create_plot(years)
    print('Done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment