# MechCoder/scrape_imdb.py

## scrape_imdb.py
import csv
import pickle
import urllib
import urllib.request
from urllib.error import HTTPError

from bs4 import BeautifulSoup

# The logic here is messy but it seems to work.
# The idea is to pass the html read from urllib to BeautifulSoup
# which will return a soup object.
# Then, from the raw html page, you just need to find the tags around the things you need.
# For example, the html shown below surrounds the name of the director,
# so you can use soup.find(itemprop="director").get_text() and manipulate
# the returned text, using either regex or string find operations, to extract just the name.


#     <div class="txt-block" itemprop="director" itemscope itemtype="http://schema.org/Person">
#         <h4 class="inline">Director:</h4>
# <a href="/name/nm0005124/?ref_=tt_ov_dr"
# itemprop='url'><span class="itemprop" itemprop="name">John Lasseter</span></a>
#     </div>

# Maps a cleaned movie title to
# [{'director': str}, {'producer': [str, ...]}, {'writer': [str, ...]}].
movie_dict = {}

# Change this path to the actual csv file.
LINKS_CSV_PATH = '/home/manoj/Downloads/ml-latest/links.csv'
OUTPUT_PICKLE_PATH = 'movie_dict.pkl'
IMDB_TITLE_URL = "http://www.imdb.com/title/tt"


def clean_title(page_title):
    """Return the movie name from a page title, dropping everything from '('.

    Titles without a parenthesis are returned whole (only stripped); slicing
    with the -1 that ``str.find`` returns would silently cut the last letter.
    """
    return page_title.partition('(')[0].strip()


def parse_director(text):
    """Return the director name(s) from the text of the director element.

    Strips surrounding whitespace, removes a leading ``Director:`` or
    ``Directors:`` label if present, and strips again.
    """
    text = text.strip()
    # Check the longer label first so "Directors:" does not leave a stray ':'.
    for label in ("Directors:", "Director:"):
        if text.startswith(label):
            return text[len(label):].strip()
    return text


def parse_writers(text):
    """Return the de-duplicated writer names found in a creator element's text.

    Only the second and third lines of the stripped text are inspected (the
    first line is skipped; presumably it is the "Writers:" label - confirm
    against the page markup). For each line, anything from '(' onwards is
    dropped, a line that is just ',' is ignored, and the name is cut at the
    first comma.
    """
    writers = []
    for line in text.strip().split('\n')[1:3]:
        name, paren, _ = line.partition('(')
        name = name.strip()
        # A bare separator line carries no name. (Only checked when the line
        # had no parenthesised credit.)
        if not paren and name == ',':
            continue
        name = name.partition(',')[0]
        if name not in writers:
            writers.append(name)
    return writers


def scrape_movie(imdb_id):
    """Fetch one IMDb title page and extract director, producers and writers.

    Parameters
    ----------
    imdb_id : str
        Identifier appended to ``IMDB_TITLE_URL`` to build the page address.

    Returns
    -------
    tuple
        ``(movie, details)`` where ``movie`` is the cleaned title and
        ``details`` is ``[{'director': ...}, {'producer': [...]},
        {'writer': [...]}]``.

    Raises
    ------
    urllib.error.HTTPError
        Propagated from ``urlopen`` when the server answers with an error.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(IMDB_TITLE_URL + imdb_id) as response:
        html = response.read()

    # Get the html in the form of a soup to easily extract data.
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # pass one explicitly if results must be identical across machines.
    soup = BeautifulSoup(html)
    page_title = soup.title.get_text()
    print("Processing movie %s" % page_title)
    movie = clean_title(page_title)

    # Pages without a director element yield an empty name instead of crashing.
    director_soup = soup.find(itemprop="director")
    director = '' if director_soup is None else parse_director(
        director_soup.get_text())
    print("director is %s" % director)

    # The first 'creator' element is treated as the writers block; every
    # following one is recorded as a producer.
    creators = soup.find_all(itemprop='creator')
    productions_list = [creator.get_text().strip() for creator in creators[1:]]
    print("Producers are %s" % productions_list)

    # Writers are only parsed when at least one producer element was found.
    writers = parse_writers(creators[0].get_text()) if productions_list else []
    writer_dict = {'writer': writers}
    print(writer_dict)

    details = [{'director': director}, {'producer': productions_list},
               writer_dict]
    return movie, details


def main(links_path=LINKS_CSV_PATH, output_path=OUTPUT_PICKLE_PATH):
    """Scrape every title listed in the links csv and pickle the results.

    Fills the module-level ``movie_dict`` and writes it to ``output_path``.
    Titles whose page request fails with an HTTP error are reported and
    skipped so one bad link does not discard the whole run.

    Returns the populated ``movie_dict``.
    """
    # newline='' is what the csv module expects for files it reads.
    with open(links_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # The first row is the csv header, so we ignore it.
        next(reader, None)

        for row in reader:
            # Blank or short rows have no second-to-last column to read.
            if len(row) < 2:
                continue

            # Second-to-last column: the id that completes the title link.
            imdb_id = str(row[-2])
            try:
                movie, details = scrape_movie(imdb_id)
            except HTTPError as err:
                print("Skipping tt%s: HTTP error %s" % (imdb_id, err.code))
                continue
            movie_dict[movie] = details

    with open(output_path, 'wb') as output:
        pickle.dump(movie_dict, output)
    return movie_dict


if __name__ == "__main__":
    main()