Skip to content

Instantly share code, notes, and snippets.

@MechCoder
Last active April 15, 2016 01:00
Show Gist options
  • Save MechCoder/27609a754a43b878343a to your computer and use it in GitHub Desktop.
Save MechCoder/27609a754a43b878343a to your computer and use it in GitHub Desktop.
import csv
import urllib
import pickle
from urllib.error import HTTPError
from bs4 import BeautifulSoup
# The logic here is messy but it seems to work.
# The idea is to pass the html read from urllib to BeautifulSoup
# which will return a soup object.
# Then, from the raw html page, you just need to find the tags around the things you need.
# For example, the html shown below surrounds the name of the director,
# so you can just use soup.find(itemprop="director").get_text() and manipulate
# the text returned, using either regex or string find operations, to extract just the name.
# <div class="txt-block" itemprop="director" itemscope itemtype="http://schema.org/Person">
# <h4 class="inline">Director:</h4>
# <a href="/name/nm0005124/?ref_=tt_ov_dr"
# itemprop='url'><span class="itemprop" itemprop="name">John Lasseter</span></a>
# </div>
# Path of the links csv file. Change this to the actual csv file.
LINKS_CSV_PATH = '/home/manoj/Downloads/ml-latest/links.csv'

# Maps a movie title to [director_dict, productions_dict, writer_dict].
movie_dict = {}


def clean_title(page_title):
    """Return the page title with everything from the first '(' removed.

    If the title contains no '(' it is returned whole (only stripped), so
    that the last character is not chopped off by a -1 index.
    """
    paren = page_title.find('(')
    if paren == -1:
        return page_title.strip()
    return page_title[:paren].strip()


def extract_director(block_text):
    """Return the director name(s) from the text of the director block.

    A leading "Director:" or "Directors:" label is removed; text without
    such a label is returned stripped but otherwise unchanged.
    """
    text = block_text.strip()
    # Check the longer label first so "Directors:" does not leave a stray ':'.
    for label in ("Directors:", "Director:"):
        if text.startswith(label):
            return text[len(label):].strip()
    return text


def extract_writers(creator_text):
    """Return a list of unique writer names from the first creator block.

    Only the second and third lines of the stripped text are inspected (the
    first line is presumably the "Writers:" heading -- confirm against the
    page markup).  For each of those lines, anything from the first '(' and
    from the first ',' onwards is dropped.  Empty results are skipped.
    """
    writers = []
    for line in creator_text.strip().split('\n')[1:3]:
        # Drop a parenthesised suffix such as "(screenplay)".
        name = line.partition('(')[0].strip()
        # Drop a trailing separator comma (a lone ',' becomes empty).
        name = name.partition(',')[0].strip()
        if name and name not in writers:
            writers.append(name)
    return writers


def scrape_movie(imdb_id):
    """Fetch one IMDb title page and extract its title and credits.

    Returns a tuple ``(title, [director_dict, productions_dict, writer_dict])``.
    Raises ``urllib.error.HTTPError`` if the page cannot be fetched.
    """
    # A bare `import urllib` does not load the `request` submodule.
    import urllib.request

    link = "http://www.imdb.com/title/tt" + imdb_id
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(link) as response:
        html = response.read()

    # Get the html in the form of a soup to easily extract data.  The parser
    # is named explicitly so the result does not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    movie = soup.title.get_text()
    print("Processing movie %s" % movie)
    movie = clean_title(movie)

    # The director block may be absent on some pages; use '' in that case.
    director_soup = soup.find(itemprop="director")
    if director_soup is not None:
        director = extract_director(director_soup.get_text())
    else:
        director = ''
    print("director is %s" % director)
    director_dict = {'director': director}

    # The first "creator" element holds the writers, the remaining ones are
    # treated as the list of producers.
    creators = soup.find_all(itemprop='creator')
    productions_list = [entry.get_text().strip() for entry in creators[1:]]
    print("Producers are %s" % productions_list)
    productions_dict = {'producer': productions_list}

    # As in the original script, writers are only parsed when at least one
    # producer entry follows the first creator element.
    writers = extract_writers(creators[0].get_text()) if productions_list else []
    writer_dict = {'writer': writers}
    print(writer_dict)

    return movie, [director_dict, productions_dict, writer_dict]


def main():
    """Scrape every link in the csv file and pickle the collected data."""
    with open(LINKS_CSV_PATH, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # The first row is the csv header, so skip it.
        next(reader, None)
        for row in reader:
            # Ignore blank or truncated rows that have no link column.
            if len(row) < 2:
                continue
            # The second-to-last column holds the link id (presumably the
            # 2nd column of a three-column file -- confirm against the csv).
            imdb_id = str(row[-2])
            try:
                movie, details = scrape_movie(imdb_id)
            except HTTPError as err:
                # One unavailable page should not discard all collected data.
                print("Skipping %s: %s" % (imdb_id, err))
                continue
            movie_dict[movie] = details

    with open('movie_dict.pkl', 'wb') as output:
        pickle.dump(movie_dict, output)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment