# This script connects to several IMDB pages to gather the whole list of Friends episodes as JSON;
# each entry includes the cast as returned by IMDB. A great dataset for starting a MongoDB lab for
# my students.
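#
# Each episode is written to friends-full.json as one JSON document, roughly shaped
# as follows (the values below are illustrative, not real data):
#
#   {"position": {"season": 1, "episode": 1},
#    "number": 1,
#    "airdate": "22 Sep. 1994",
#    "title": "...",
#    "description": "...",
#    "ratingValue": 8.5,
#    "ratingCount": 1234,
#    "directors": ["..."],
#    "writers": ["...", "..."],
#    "cast": [{"actor": "Jennifer Aniston", "character": "Rachel Green"}, ...]}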
from bs4 import BeautifulSoup
import json
import locale
import logging
import requests

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# IMDB formats numbers with thousands separators (e.g. "1,234"), so parse them
# with an English locale.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

file = open("friends-full.json", 'w')
episode_counter = 0
def seasons_url_generator():
    for season in range(1, 11):
        yield ("http://www.imdb.com/title/tt0108778/episodes?season=" + str(season), season)
def seasons_extract(season, entry):
    global episode_counter
    episode_counter += 1
    data = {
        "position": {
            "season": season,
            "episode": int(entry.meta["content"])
        },
        "number": episode_counter,
        "airdate": entry.find("div", {"class": "airdate"}).contents[0].strip(),
        "title": entry.strong.a["title"],
        "description": entry.find("div", {"itemprop": "description"}).contents[0].strip(),
        "cast": []
    }
    url = "http://www.imdb.com" + entry.strong.a["href"]
    url = url[:url.index('?')]
    episodes_extract(url, data)
    json.dump(data, file)
    file.write("\n")  # one JSON document per line, so tools like mongoimport can read the file
def episodes_extract(url, episode_data):
    logger.info('Get URL for episode ' + str(episode_data['position']['episode']) + " of season " + str(episode_data['position']['season']))
    logger.debug('URL: ' + url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    episode_data["ratingValue"] = locale.atof(soup.find("span", {"itemprop": "ratingValue"}).contents[0].strip())
    episode_data["ratingCount"] = locale.atoi(soup.find("span", {"itemprop": "ratingCount"}).contents[0].strip())
    fullcast_extract(url + "fullcredits", episode_data)
def fullcast_extract(url, episode_data):
    logger.info('Get URL for fullcredit of episode ' + str(episode_data['position']['episode']) + " of season " + str(episode_data['position']['season']))
    logger.debug('URL: ' + url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    credits = soup.find("div", {"id": "fullcredits_content"}).findAll(["table", "h4"])
    # director(s)
    assert "Directed by" in credits[0].contents[0]
    episode_data["directors"] = []
    for directors in credits[1].tbody.findAll("tr"):
        director = directors.find("td", {"class": "name"})
        if director is None:
            continue
        elif director.find('a') is None:
            episode_data["directors"].append(director.contents[0].strip())
        else:
            episode_data["directors"].append(director.find('a').contents[0].strip())
    assert len(episode_data["directors"]) > 0
    # writers
    assert "Writing Credits" in credits[2].contents[0]
    episode_data["writers"] = []
    for writers in credits[3].tbody.findAll("tr"):
        writer = writers.find("td", {"class": "name"})
        if writer is None:
            continue
        elif writer.find('a') is None:
            episode_data["writers"].append(writer.contents[0].strip())
        else:
            episode_data["writers"].append(writer.find('a').contents[0].strip())
    episode_data["writers"] = sorted(set(episode_data["writers"]))
    assert len(episode_data["writers"]) > 0
    # cast
    episode_data["cast"] = []
    actor_entries = soup.find("table", {"class": "cast_list"}).findAll("tr", {"class": ["odd", "even"]})
    for entry in actor_entries:
        actor_extract(entry, episode_data)
    assert len(episode_data["cast"]) > 0
def actor_extract(entry, episode_data):
    data = {
        "actor": entry.find("td", {"itemprop": "actor"}).text.strip(),
        # collapse the newlines and repeated spaces IMDB puts in the character cell
        "character": " ".join(entry.find("td", {"class": "character"}).text.split())
    }
    episode_data["cast"].append(data)
for (url, season) in seasons_url_generator():
    logger.info('Get URL for season ' + str(season))
    logger.debug('URL: ' + url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    episode_entries = soup.findAll("div", {"class": "info", "itemprop": "episodes"})
    for entry in episode_entries:
        seasons_extract(season, entry)
file.close()
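
Once friends-full.json has been generated, it can be imported with mongoimport
(e.g. mongoimport --db friends --collection episodes --file friends-full.json)
or loaded from Python. Below is a minimal sketch assuming the pymongo package
and a MongoDB server on localhost; the "friends" database, "episodes" collection,
and the query are illustrative, not part of the script above.

import json
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
episodes = client["friends"]["episodes"]

with open("friends-full.json") as f:
    # the scraper writes one JSON document per line
    episodes.insert_many(json.loads(line) for line in f if line.strip())

# Example lab query: every episode in which Jennifer Aniston is credited.
for ep in episodes.find({"cast.actor": "Jennifer Aniston"}).sort("number"):
    print(ep["position"], ep["title"])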