Last active
April 15, 2016 01:00
-
-
Save MechCoder/27609a754a43b878343a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import pickle
import urllib
import urllib.error
import urllib.request
from urllib.error import HTTPError

from bs4 import BeautifulSoup
# The logic here is messy, but it seems to work.
# The idea is to pass the HTML read via urllib to BeautifulSoup,
# which returns a soup object.
# Then, from the raw HTML page, you just need to find the tags around the things you need.
# For example, the HTML below is what surrounds the name of the director,
# so you can use soup.find(itemprop="director").get_text() and manipulate
# the returned text, using either regex or string find operations, to extract just the name.
# <div class="txt-block" itemprop="director" itemscope itemtype="http://schema.org/Person">
# <h4 class="inline">Director:</h4>
# <a href="/name/nm0005124/?ref_=tt_ov_dr"
# itemprop='url'><span class="itemprop" itemprop="name">John Lasseter</span></a>
# </div>
# Location of the MovieLens links file. Change path to actual csv file.
LINKS_CSV = '/home/manoj/Downloads/ml-latest/links.csv'
# Pickle file that receives the scraped credits.
OUTPUT_PICKLE = 'movie_dict.pkl'
# IMDb title pages live at this prefix followed by the id taken from the csv.
IMDB_TITLE_URL = "http://www.imdb.com/title/tt"

# Maps movie name -> [{'director': str}, {'producer': [str]}, {'writer': [str]}].
movie_dict = {}


def clean_title(page_title):
    """Return the movie name from a page title, dropping everything from '('.

    A title without any parenthesis is returned whole (only stripped), rather
    than losing its last character.
    """
    return page_title.partition('(')[0].strip()


def strip_label(text, label="Director"):
    """Remove a leading 'Label:' / 'Labels:' heading and surrounding whitespace.

    Text that does not start with the heading is returned stripped but
    otherwise untouched.
    """
    text = text.strip()
    # Try the plural form first so "Directors:" does not leave a stray ":".
    for prefix in (label + "s:", label + ":"):
        if text.startswith(prefix):
            return text[len(prefix):].strip()
    return text


def clean_writer(raw):
    """Return the bare name from one credit line, or '' if nothing is left.

    Anything from the first '(' (the role note) and from the first ','
    (the list separator) onwards is discarded.
    """
    name = raw.partition('(')[0]
    name = name.partition(',')[0]
    return name.strip()


def parse_writers(credit_text):
    """Return the unique writer names found in the text of a credit block.

    Only the second and third lines of the stripped text are inspected: the
    first line is skipped and everything after the third is ignored.
    """
    writers = []
    for line in credit_text.strip().split('\n')[1:3]:
        name = clean_writer(line)
        if name and name not in writers:
            writers.append(name)
    return writers


def extract_credits(soup):
    """Return [director_dict, productions_dict, writer_dict] for a parsed page.

    ``soup`` is the BeautifulSoup object of one title page. Missing elements
    yield empty values instead of raising.
    """
    director_soup = soup.find(itemprop="director")
    if director_soup is None:
        director = ''
    else:
        director = strip_label(director_soup.get_text())
    print("director is %s" % director)

    # The first 'creator' element holds the writers; the remaining ones are
    # treated as the producers.
    creator_soups = soup.find_all(itemprop='creator')
    productions_list = [c.get_text().strip() for c in creator_soups[1:]]
    print("Producers are %s" % productions_list)

    # Writers are only read when at least one producer element exists, as in
    # the original script. NOTE(review): presumably a lone 'creator' element
    # is not reliably the writers block -- confirm.
    writers = []
    if productions_list:
        writers = parse_writers(creator_soups[0].get_text())
    writer_dict = {'writer': writers}
    print(writer_dict)

    return [{'director': director}, {'producer': productions_list}, writer_dict]


def main():
    """Scrape every movie listed in LINKS_CSV and pickle the collected credits."""
    # newline='' is what the csv module expects for files it reads.
    with open(LINKS_CSV, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # Because the first row is the csv header we ignore it.
        next(reader, None)
        for row in reader:
            # Blank or truncated rows have no link column to read.
            if len(row) < 2:
                continue
            # Get the 2nd column which is the link.
            link = IMDB_TITLE_URL + str(row[-2])
            try:
                with urllib.request.urlopen(link) as response:
                    html = response.read()
            except urllib.error.URLError as err:
                # HTTPError is a URLError subclass; one bad page must not
                # throw away everything scraped so far.
                print("Skipping %s: %s" % (link, err))
                continue

            # Get the html in the form of a soup to easily extract data.
            # The parser is named explicitly so results do not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(html, "html.parser")
            if soup.title is None:
                print("Skipping %s: page has no title" % link)
                continue
            movie = soup.title.get_text()
            print("Processing movie %s" % movie)
            movie_dict[clean_title(movie)] = extract_credits(soup)

    with open(OUTPUT_PICKLE, 'wb') as output:
        pickle.dump(movie_dict, output)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment