Skip to content

Instantly share code, notes, and snippets.

@rhiever
Last active August 29, 2015 14:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rhiever/b4806dfa7304df741a0e to your computer and use it in GitHub Desktop.
Save rhiever/b4806dfa7304df741a0e to your computer and use it in GitHub Desktop.
import urllib2
import time
import os
# Make a directory to store all of the HTML pages.
# os.makedirs avoids spawning a shell, and the isdir guard keeps re-runs quiet
# when the directory is already there.
if not os.path.isdir("pages"):
    os.makedirs("pages")

# Download the raw HTML of every weekly chart page, one file per (year, week),
# saved as pages/<year>-<week>.txt.
for year in range(1982, 2015):
    for week in range(1, 53):
        url = "http://boxofficemojo.com/weekly/chart/?yr=%d&wk=%d&p=.htm" % (year, week)
        try:
            response = urllib2.urlopen(url)
            try:
                page_text = response.read()
            finally:
                # Release the connection even if the read fails part-way.
                response.close()
            with open("pages/%d-%d.txt" % (year, week), "wb") as out_file:
                out_file.write(page_text)
            # Pause between successful requests so the server is not hammered.
            time.sleep(1)
        except Exception:
            # Best effort: report the failed week and move on to the next one.
            # Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit still stop the run.
            print("Error with week %d of %d" % (week, year))
# Parse all of the HTML into a tsv
from glob import glob
from BeautifulSoup import BeautifulSoup
def _chart_week(path):
    """Return (year, week) as ints parsed from a ".../<year>-<week>.txt" path."""
    # basename/splitext instead of splitting on "/" so this also works when
    # glob returns paths using the platform's own separator.
    stem = os.path.splitext(os.path.basename(path))[0]
    year_text, week_text = stem.split("-")[:2]
    return int(year_text), int(week_text)


# Write one TSV row per movie per week to top-movies-by-week.tsv.
# The first line is the header; every later row is prefixed with "\n", so the
# file has no trailing newline.
with open("top-movies-by-week.tsv", "wb") as out_file:
    header = ["year", "week", "rank_this_week", "rank_last_week", "title", "studio", "weekly_gross",
              "pct_change", "theater_count", "theater_count_change", "average_gross_per_theater",
              "total_gross_so_far", "budget", "weeks_running"]
    out_file.write("\t".join(header))

    # Sort numerically by (year, week); a plain string sort would put
    # "1982-10" before "1982-2" and scramble the chronological order.
    for filename in sorted(glob("pages/*.txt"), key=_chart_week):
        year, week = _chart_week(filename)

        with open(filename, "rb") as in_file:
            page_text = in_file.read()

        soup = BeautifulSoup(page_text)
        tables = soup.findAll("table")
        # NOTE(review): the fourth <table> is presumably the chart itself, and
        # the first three / last two <tr> rows presumably header and footer
        # rows -- this depends on the site's page layout; confirm against a
        # saved page.
        if len(tables) < 4:
            # A page without the expected layout should not abort the whole
            # run; report it and continue with the next file.
            print("Error: %s, no chart table found" % filename)
            continue
        movie_entries = tables[3].findAll("tr")[3:-2]

        for movie_entry in movie_entries:
            movie_lines = movie_entry.findAll("td")
            entries = [str(year), str(week)]
            # A cell containing only "-" is recorded as "0"; everything else
            # is written as UTF-8 encoded bytes.
            entries.extend("0" if x.text == "-" else x.text.encode("utf-8", "replace")
                           for x in movie_lines)
            if len(entries) != len(header):
                # Guard the index: a row with no <td> cells has no entries[2].
                label = entries[2] if len(entries) > 2 else "<empty row>"
                print("Error: %s, %s" % (filename, label))
            # The row is still written when the column count is off (as the
            # original did), so nothing is silently dropped.
            out_file.write("\n" + "\t".join(entries))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment