# Gist by @cflint12, created December 7, 2014.
# Saved from: https://gist.github.com/cflint12/17be988516618b96af05
# This program was generated to scrape movie data from the IMDB website
# in conjunction with omdbapi.com.
'''
This program was generated to scrape movie data from the IMDB website
in conjunction with omdbapi.com
'''
#Import the necessary libraries
import json
import os
import urllib2

from bs4 import BeautifulSoup as bs
import pandas as pd
#Change directory to the folder you would like to store everything in:
#os.chdir('Set working directory here ...')
'''
Define a function to scrape a list of movie titles from IMDB.
The list should also contain the year the movie was released
and its box office earnings (in millions of US dollars).
The input to the function is the number of titles desired.
The function pulls the movie titles in descending order of box office earnings.
'''
def Get_Movie_Titles(number_of_titles):
    '''
    Scrape movie titles from IMDB, in descending order of US box office gross.

    Returns a list of [title, year, box_office_earnings] lists, where each
    title has been stripped of its trailing "(Year)" suffix and had accented
    e/u characters replaced with their plain ASCII equivalents.
    '''
    # Each IMDB search page serves 100 results, so build one URL per page.
    url_prefix = 'http://www.imdb.com/search/title?at=0&count=100&sort=boxoffice_gross_us&start='
    url_suffix = '&title_type=feature,tv_series,tv_movie'
    page_urls = [url_prefix + str(start) + url_suffix
                 for start in range(1, number_of_titles, 100)]

    gross_values = []   # box office earnings strings, in page order
    title_anchors = []  # <a> tags linking to a /title/ page that carry a 'title' attribute
    for page_url in page_urls:
        page = bs(urllib2.urlopen(page_url))
        # The column the results were sorted by (box office gross) is marked
        # with class "sort_col" on its TD cells.
        gross_values.extend(cell.string for cell in page.findAll('td', {"class": "sort_col"}))
        title_anchors.extend(anchor for anchor in page.findAll('a')
                             if '/title/' in anchor['href'] and anchor.has_attr('title'))

    # Keep only genuine movie titles: drop navigation/utility links and TV series.
    rejected = ('Register or login to rate this title', 'Delete', 'Home')
    kept_titles = [anchor['title'] for anchor in title_anchors
                   if anchor['title'] not in rejected
                   and 'TV Series' not in anchor['title']]

    # Split "Title (Year)" into its parts and normalize the title text.
    records = []
    for raw_title in kept_titles:
        pieces = raw_title.rsplit('(', 1)  # separate the "(Year)" suffix
        name = pieces[0].strip().replace('\xe9', 'e').replace('\xfc', 'u')
        records.append([name, pieces[1][0:4]])  # first four chars = the year digits

    # Pair every [title, year] with its box office figure, by position.
    for position, record in enumerate(records):
        record.append(gross_values[position])
    return records
'''
Next, define a function to utilize the list of cleaned movie titles and years
to pull additional data (IMDB ratings, release date, actors, director, awards, rating, etc.)
in the form of a JSON string from omdbapi.com (Open Movie Database)
The function reformats the titles to place them into URLs compatible with the site,
then pulls the data generated from each URL into a new list (movie_data), which it returns.
'''
def Get_Movie_Data(cleaned_titles):
    '''
    Fetch a JSON string from omdbapi.com for each [title, year, ...] entry.

    Side effect: rewrites each entry's title in place (&, : and spaces are
    replaced with their URL-encoded forms), as the original code did — callers
    downstream rely on seeing the rewritten titles.

    Returns a list of raw JSON response strings, one per movie, in order.
    '''
    OMDB_URLs = []  # one query URL per movie
    for title in cleaned_titles:
        # NOTE(review): only &, : and spaces are escaped here; urllib.quote
        # would be more thorough but would change the URLs the original built.
        title[0] = title[0].replace('&', '%26')
        title[0] = title[0].replace(':', '%3A')
        scraped_title = title[0] = title[0].replace(' ', '+')
        scraped_year = title[1]
        OMDB_URL = 'http://www.omdbapi.com/?t=' + scraped_title + '&y=' + scraped_year + '&plot=short&r=json'
        OMDB_URLs.append(OMDB_URL)
    movie_data = []
    for url in OMDB_URLs:
        # Bug fix: the original pushed the JSON body through BeautifulSoup and
        # read soup.p.string, which misuses an HTML parser on JSON and can
        # yield None. Read the raw response body directly instead, and close
        # the connection so it is not leaked.
        response = urllib2.urlopen(url)
        try:
            movie_data.append(response.read())
        finally:
            response.close()
    return movie_data
'''
Next, define a function that will take the JSON data string and parse it into a dictionary
for each of the fields.
'''
def Extract_Movie_Data(JSON_Data):
    '''
    Parse one omdbapi.com JSON response string into a {field: value} dict.

    Bug fix: the original hand-rolled parser split the string on double quotes
    and filtered tokens with identity comparisons (`element is ','`), which is
    fragile (it only works because CPython interns one-character strings),
    broke on any value containing an embedded quote, comma or colon, and
    silently truncated the result to the first 20 key/value pairs. Using the
    standard json module parses every field correctly. Callers receive the
    same flat string-keyed dict as before (possibly with more keys, since
    nothing is truncated any more).
    '''
    return json.loads(JSON_Data)
'''
Next, use the functions that were just defined to pull the movie data.
'''
# --- Driver script: pull the data, join it, and save it to CSV. ---
# NOTE(review): indentation below is reconstructed — the source was flattened
# by a web scrape; loop bodies were inferred from the data flow.
Movie_Titles = Get_Movie_Titles(5000) #Get the top 5000 movie titles from IMDB
Clean_Movie_Titles = Movie_Titles  # NOTE(review): alias, not a copy — both names refer to the same list object
Movie_Data = Get_Movie_Data(Movie_Titles) #Pull the data from each of those titles
Combined_Movie_List = [] #Create an empty list to hold title, year, box office earnings and JSON data together
for i in range(len(Movie_Titles)): #Populate the list: [title, year, gross, json_string] per movie
    Combined_Movie_List.append([Clean_Movie_Titles[i][0], Clean_Movie_Titles[i][1], Clean_Movie_Titles[i][2], Movie_Data[i]])
JSON_List = [i[3] for i in Combined_Movie_List] #Create list of just the JSON data
Movie_List_of_Dict = []
for i in JSON_List: #Use the function previously defined to create a dictionary for all JSON strings
    movie_data = Extract_Movie_Data(i)
    Movie_List_of_Dict.append(movie_data) #Place all of the newly created dictionaries into a list
# Field names from the first movie's dict; assumes every response has the
# same fields in the same order — TODO confirm against omdbapi error responses.
Movie_Dict_Keys = Movie_List_of_Dict[0].keys() #Store all of the dictionary key values as a list
Movie_Dict = {}
Box_Office_Dict = {}
for i in range(len(Clean_Movie_Titles)): #Create a dictionary of all of the box office values, using the movie title as a key
    Box_Office_Dict[Clean_Movie_Titles[i][0]] = Clean_Movie_Titles[i][2]
for i in range(len(Clean_Movie_Titles)): #Create a dictionary of all of the dictionaries (from JSON), using the movie title as a key
    Movie_Dict[Clean_Movie_Titles[i][0]] = Movie_List_of_Dict[i]
Movie_DF = pd.DataFrame(Movie_Dict, index = Movie_Dict_Keys) #Create data frame out of dictionaries; one column per movie
# NOTE(review): DataFrame.append is removed in modern pandas (use pd.concat);
# kept as-is since this script targets the py2-era pandas it was written for.
Movie_DF = Movie_DF.append(Box_Office_Dict, ignore_index = True) #Add box office earnings info as an extra row
Movie_DF = Movie_DF.T #Transpose the data frame: one row per movie, one column per field
columns = Movie_Dict_Keys #NOTE(review): alias — the append below also mutates Movie_Dict_Keys (harmless here, its last prior use was building the DataFrame)
columns.append('Box Office Earnings') #Add 'Box Office Earnings' to column header list
columns_range = range(len(columns))
columns_dict = {}
for i in range(len(columns)): #Create dictionary mapping positional (integer) column labels to the field names
    columns_dict[columns_range[i]] = columns[i]
Movie_DF = Movie_DF.rename(columns = columns_dict) #Rename the column headers
Movie_DF.to_csv('Movie_DF_11_20_2014.csv') #Save the data frame to a CSV file for future use
# End of gist.