jbeluch/scrape.py

## scrape.py
#!/usr/bin/env python
'''
Jonathan Beluch - web@jonathanbeluch.com

Scrapes movie titles from http://www.classiccinemaonline.com and
creates a histogram showing the distribution of titles by year.

Requires BeautifulSoup, matplotlib and numpy.
'''

from BeautifulSoup import BeautifulSoup as BS
from urlparse import urljoin, parse_qs
from urllib import urlencode
from itertools import chain
import urllib2
import re

# Root of the site; scraped hrefs are joined onto this.
BASE_URL = 'http://www.classiccinemaonline.com'
# Page from which the genre links are scraped.
HOMEPAGE_URL = BASE_URL + '/1/index.php'
# Output file names: the histogram image and the saved list of titles.
CHART_FN, TITLES_FN = 'histogram.png', 'movie_titles'

def get_page(url, data=None):
    '''Fetch `url` and return the raw response body.

    `data` is handed unchanged to urllib2.urlopen (per the urllib2 docs a
    non-None value turns the request into a POST with `data` as its body).
    '''
    response = urllib2.urlopen(url, data)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises part-way through.
        response.close()

def get_genre_urls(url):
    '''Return absolute URLs for the links listed in the 'rightcol' div.

    The page at `url` is downloaded, two known pieces of malformed markup are
    repaired, and every <a> inside the div's first <ul> is joined onto
    BASE_URL.
    '''
    # Broken markup that makes BeautifulSoup barf, paired with its repair.
    repairs = (
        ('</font color>', '</font>'),
        (r'<ol class=\"latestnews \">', '<ol class="latestnews">'),
    )
    markup = get_page(url)
    for broken, fixed in repairs:
        markup = markup.replace(broken, fixed)

    # The <a> tags have inconsistent class names, so find the enclosing div
    # by id and walk its child links instead.
    rightcol = BS(markup).find('div', {'id': 'rightcol'})
    links = []
    for anchor in rightcol.ul.findAll('a'):
        links.append(urljoin(BASE_URL, anchor['href']))
    return links

def parse_movies(url):
    '''Return the stripped link text of every listing row on a genre page.

    The 'id' value from the query string of `url` is re-sent as form data
    together with limit=0 (presumably this asks the site for the full,
    unpaginated listing -- confirm against the site). Rows are the <tr> tags
    whose class matches 'sectiontableentry'.
    '''
    _path, query = url.split('?', 1)
    genre_id = parse_qs(query)['id'][0]
    form = urlencode({'id': genre_id, 'limit': '0'})
    page = BS(get_page(url, form))
    print('Parsing movies from \'%s\'' % page.title.string.strip())

    rows = page.findAll('tr', {'class': re.compile('sectiontableentry')})
    titles = []
    for row in rows:
        titles.append(row.a.string.strip())
    return titles

# Matches a four-digit number wrapped in parentheses, e.g. "(1939)".
p = re.compile(r'\((\d{4})\)')
def get_year(title):
    '''Return the first parenthesised four-digit year in `title` as an int.

    Prints a warning and returns None when `title` contains no such year.
    '''
    found = p.search(title)
    if found is None:
        print("Warning: No year match for '%s'." % title)
        return None
    return int(found.group(1))

def print_to_file(items, fn):
    '''Write each item of `items` to the file `fn`, one per line, as UTF-8.

    Items are formatted with '%s'. Byte strings are decoded as UTF-8 first.
    Writing through io.open avoids the UnicodeEncodeError that str() raises
    on unicode titles containing non-ASCII characters.
    '''
    import io
    with io.open(fn, 'w', encoding='utf-8') as f:
        for item in items:
            if isinstance(item, bytes):
                item = item.decode('utf-8')
            f.write(u'%s\n' % (item,))

def get_from_file(fn):
    '''Return the lines of `fn` with surrounding whitespace stripped.'''
    with open(fn) as source:
        return [line.strip() for line in source.readlines()]

def create_plot(years):
    '''Save a histogram of `years`, one bar per calendar year, to CHART_FN.

    years -- non-empty iterable of integer years.

    Raises ValueError when `years` is empty, since there is nothing to plot.
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MultipleLocator

    years = list(years)
    if not years:
        raise ValueError('create_plot() needs at least one year to plot')

    # Set up the figure and main plot
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Give hist() explicit bin edges first, first+1, ..., last+1 so that every
    # year gets its own one-year-wide bin. A bin *count* of (last - first)
    # lumps the final two years together, because hist() closes the last bin
    # on both sides, and is rejected outright when all years are equal.
    edges = np.arange(min(years), max(years) + 2)
    # align='left' centres each bar on its left edge, i.e. on the year itself.
    ax.hist(years, bins=edges, align='left')

    # Change major axes tickers to multiples of 5 instead of default 10
    ax.xaxis.set_major_locator(MultipleLocator(5))
    ax.yaxis.set_major_locator(MultipleLocator(5))
    ax.grid(True)

    # Set up axes labels and plot title
    ax.set_ylabel('Number of Movies + TV Shows')
    ax.set_xlabel('Year')
    ax.set_title('Number of Movies and TV Shows per Year')

    # Enlarge the output figure so things aren't as cramped
    fig.set_size_inches(12, 9)

    # Save output
    print("Saving histogram to '%s' ..." % CHART_FN)
    plt.savefig(CHART_FN)

if __name__ == '__main__':
    # Find the genre listing pages linked from the home page.
    print('Downloading genres...')
    genre_urls = get_genre_urls(HOMEPAGE_URL)

    # Collect the titles of every genre into one flat list.
    print('Downloading movie titles...')
    titles = []
    for genre_url in genre_urls:
        titles.extend(parse_movies(genre_url))

    # Save the titles so a later run can skip the download.
    print("Saving movie titles to '%s' ..." % TITLES_FN)
    print_to_file(titles, TITLES_FN)

    # To rerun without downloading everything, comment out the steps above
    # and start here:
    #titles = get_from_file(TITLES_FN)

    # get_year() yields None for titles without a year; drop those entries.
    print('Parsing years...')
    years = [year for year in map(get_year, titles) if year]

    # Build and save the graph.
    create_plot(years)

    print('Done.')
	#!/usr/bin/env python
	'''
	Jonathan Beluch - web@jonathanbeluch.com

	Scrapes movie titles from http://www.classiccinemaonline.com and
	creates a histogram showing the distribution of titles by year.

	Requires BeautifulSoup, matplotlib and numpy.
	'''

	from BeautifulSoup import BeautifulSoup as BS
	from urlparse import urljoin, parse_qs
	from urllib import urlencode
	from itertools import chain
	import urllib2
	import re

	# Root of the site; scraped hrefs are joined onto this.
	BASE_URL = 'http://www.classiccinemaonline.com'
	# Page from which the genre links are scraped.
	HOMEPAGE_URL = BASE_URL + '/1/index.php'
	# Output file names: the histogram image and the saved list of titles.
	CHART_FN, TITLES_FN = 'histogram.png', 'movie_titles'

	def get_page(url, data=None):
	    '''Fetch `url` and return the raw response body.

	    `data` is handed unchanged to urllib2.urlopen (per the urllib2 docs a
	    non-None value turns the request into a POST with `data` as its body).
	    '''
	    response = urllib2.urlopen(url, data)
	    try:
	        return response.read()
	    finally:
	        # Release the connection even when read() raises part-way through.
	        response.close()

	def get_genre_urls(url):
	    '''Return absolute URLs for the links listed in the 'rightcol' div.

	    The page at `url` is downloaded, two known pieces of malformed markup
	    are repaired, and every <a> inside the div's first <ul> is joined onto
	    BASE_URL.
	    '''
	    # Broken markup that makes BeautifulSoup barf, paired with its repair.
	    repairs = (
	        ('</font color>', '</font>'),
	        (r'<ol class=\"latestnews \">', '<ol class="latestnews">'),
	    )
	    markup = get_page(url)
	    for broken, fixed in repairs:
	        markup = markup.replace(broken, fixed)

	    # The <a> tags have inconsistent class names, so find the enclosing div
	    # by id and walk its child links instead.
	    rightcol = BS(markup).find('div', {'id': 'rightcol'})
	    links = []
	    for anchor in rightcol.ul.findAll('a'):
	        links.append(urljoin(BASE_URL, anchor['href']))
	    return links

	def parse_movies(url):
	    '''Return the stripped link text of every listing row on a genre page.

	    The 'id' value from the query string of `url` is re-sent as form data
	    together with limit=0 (presumably this asks the site for the full,
	    unpaginated listing -- confirm against the site). Rows are the <tr>
	    tags whose class matches 'sectiontableentry'.
	    '''
	    _path, query = url.split('?', 1)
	    genre_id = parse_qs(query)['id'][0]
	    form = urlencode({'id': genre_id, 'limit': '0'})
	    page = BS(get_page(url, form))
	    print('Parsing movies from \'%s\'' % page.title.string.strip())

	    rows = page.findAll('tr', {'class': re.compile('sectiontableentry')})
	    titles = []
	    for row in rows:
	        titles.append(row.a.string.strip())
	    return titles

	# Matches a four-digit number wrapped in parentheses, e.g. "(1939)".
	p = re.compile(r'\((\d{4})\)')
	def get_year(title):
	    '''Return the first parenthesised four-digit year in `title` as an int.

	    Prints a warning and returns None when `title` contains no such year.
	    '''
	    found = p.search(title)
	    if found is None:
	        print("Warning: No year match for '%s'." % title)
	        return None
	    return int(found.group(1))

	def print_to_file(items, fn):
	    '''Write each item of `items` to the file `fn`, one per line, as UTF-8.

	    Items are formatted with '%s'. Byte strings are decoded as UTF-8 first.
	    Writing through io.open avoids the UnicodeEncodeError that str() raises
	    on unicode titles containing non-ASCII characters.
	    '''
	    import io
	    with io.open(fn, 'w', encoding='utf-8') as f:
	        for item in items:
	            if isinstance(item, bytes):
	                item = item.decode('utf-8')
	            f.write(u'%s\n' % (item,))

	def get_from_file(fn):
	    '''Return the lines of `fn` with surrounding whitespace stripped.'''
	    with open(fn) as source:
	        return [line.strip() for line in source.readlines()]

	def create_plot(years):
	    '''Save a histogram of `years`, one bar per calendar year, to CHART_FN.

	    years -- non-empty iterable of integer years.

	    Raises ValueError when `years` is empty, since there is nothing to plot.
	    '''
	    import numpy as np
	    import matplotlib.pyplot as plt
	    from matplotlib.ticker import MultipleLocator

	    years = list(years)
	    if not years:
	        raise ValueError('create_plot() needs at least one year to plot')

	    # Set up the figure and main plot
	    fig = plt.figure()
	    ax = fig.add_subplot(111)

	    # Give hist() explicit bin edges first, first+1, ..., last+1 so that
	    # every year gets its own one-year-wide bin. A bin *count* of
	    # (last - first) lumps the final two years together, because hist()
	    # closes the last bin on both sides, and is rejected outright when all
	    # years are equal.
	    edges = np.arange(min(years), max(years) + 2)
	    # align='left' centres each bar on its left edge, i.e. on the year.
	    ax.hist(years, bins=edges, align='left')

	    # Change major axes tickers to multiples of 5 instead of default 10
	    ax.xaxis.set_major_locator(MultipleLocator(5))
	    ax.yaxis.set_major_locator(MultipleLocator(5))
	    ax.grid(True)

	    # Set up axes labels and plot title
	    ax.set_ylabel('Number of Movies + TV Shows')
	    ax.set_xlabel('Year')
	    ax.set_title('Number of Movies and TV Shows per Year')

	    # Enlarge the output figure so things aren't as cramped
	    fig.set_size_inches(12, 9)

	    # Save output
	    print("Saving histogram to '%s' ..." % CHART_FN)
	    plt.savefig(CHART_FN)

	if __name__ == '__main__':
	    # Find the genre listing pages linked from the home page.
	    print('Downloading genres...')
	    genre_urls = get_genre_urls(HOMEPAGE_URL)

	    # Collect the titles of every genre into one flat list.
	    print('Downloading movie titles...')
	    titles = []
	    for genre_url in genre_urls:
	        titles.extend(parse_movies(genre_url))

	    # Save the titles so a later run can skip the download.
	    print("Saving movie titles to '%s' ..." % TITLES_FN)
	    print_to_file(titles, TITLES_FN)

	    # To rerun without downloading everything, comment out the steps above
	    # and start here:
	    #titles = get_from_file(TITLES_FN)

	    # get_year() yields None for titles without a year; drop those entries.
	    print('Parsing years...')
	    years = [year for year in map(get_year, titles) if year]

	    # Build and save the graph.
	    create_plot(years)

	    print('Done.')