Created
December 7, 2014 04:58
-
-
Save cflint12/17be988516618b96af05 to your computer and use it in GitHub Desktop.
This program was generated to scrape movie data from the IMDB website in conjunction with omdbapi.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
This program was generated to scrape movie data from the IMDB website | |
in conjunction with omdbapi.com | |
''' | |
#Import the necessary libraries | |
import urllib2 | |
from bs4 import BeautifulSoup as bs | |
import pandas as pd | |
import os | |
#Change directory to the folder you would like to store everything in: | |
#os.chdir('Set working directory here ...') | |
''' | |
Define a function to scrape a list of movie titles from IMDB. | |
The list should also contain the year the movie was released | |
and its box office earnings (in millions of US dollars). | |
The input to the function is the number of titles desired. | |
The function pulls the movie titles in descending order of box office earnings. | |
''' | |
def Get_Movie_Titles(number_of_titles):
    '''
    Scrape movie titles, release years and US box office earnings from IMDB.

    number_of_titles -- how many titles to request; IMDB serves results
    100 titles per page, sorted in descending order of US box office gross.

    Returns a list of [title, year, box_office_string] lists, one per movie,
    in descending box office order.
    '''
    IMDB_URLs = [] #Each URL accounts for one page of 100 movie titles
    for start in range(1, number_of_titles, 100):
        URL = ('http://www.imdb.com/search/title?at=0&count=100'
               '&sort=boxoffice_gross_us&start=' + str(start) +
               '&title_type=feature,tv_series,tv_movie')
        IMDB_URLs.append(URL)
    BoxOfficeList = []      #Box office earnings strings, in page order
    titles_preprocess = []  #All <a> tags that carry a 'title' attribute
    titles = []             #Legitimate movie titles filtered from titles_preprocess
    cleaned_titles = []     #[title, year] pairs, cleaned for the omdbapi.com format
    for URL in IMDB_URLs:
        page = urllib2.urlopen(URL)
        soup = bs(page)
        #Earnings live in <td class="sort_col"> cells because the listing
        #is sorted on that column.
        for cell in soup.findAll('td', {"class": "sort_col"}):
            BoxOfficeList.append(cell.string)
        #Title links point at /title/... and carry a 'title' attribute.
        #Robustness fix: some anchors have no href at all, which would
        #raise KeyError in the original a['href'] lookup.
        for a in soup.findAll('a'):
            if a.has_attr('href') and '/title/' in a['href'] and a.has_attr('title'):
                titles_preprocess.append(a)
    for tag in titles_preprocess:
        #Drop UI links and TV series; keep only movie titles.
        if tag['title'] in ('Register or login to rate this title', 'Delete', 'Home'):
            continue
        if 'TV Series' in tag['title']:
            continue
        titles.append(tag['title'])
    for title in titles:
        #Titles look like "Name (2014)"; split the year off the end.
        parts = title.rsplit('(', 1)
        parts[0] = parts[0].strip()            #Strip extra spaces
        parts[0] = parts[0].replace('\xe9', 'e')  #Replace accented e with e
        parts[0] = parts[0].replace('\xfc', 'u')  #Replace umlauted u with u
        if len(parts) > 1:
            parts[1] = parts[1][0:4]  #Keep just the 4-digit year
        else:
            #Bug fix: a title with no "(year)" suffix used to raise
            #IndexError here; record an empty year instead.
            parts.append('')
        cleaned_titles.append(parts)
    #Attach the matching box office figure to each [title, year] pair.
    for i in range(len(cleaned_titles)):
        cleaned_titles[i].append(BoxOfficeList[i])
    return cleaned_titles #List of [title, year, box office earnings] per movie
''' | |
Next, define a function to utilize the list of cleaned movie titles and years | |
to pull additional data (IMDB ratings, release date, actors, director, awards, rating, etc.) | |
in the form of a JSON string from omdbapi.com (Open Movie Database) | |
The function reformats the titles to place them into URLs compatible with the site, | |
then pulls the data generated from each URL into a new list (movie_data), which it returns. | |
''' | |
def Get_Movie_Data(cleaned_titles):
    '''
    Fetch a JSON string from omdbapi.com for every [title, year] entry.

    NOTE: this mutates each entry's title in place, URL-encoding '&', ':'
    and spaces, because later code reuses the encoded titles as dictionary
    keys.  Returns a list of JSON strings, one per movie, in input order.
    '''
    request_urls = []
    for entry in cleaned_titles:
        #Hand-encode the characters that break the OMDb query string and
        #write the result back into the entry itself.
        encoded = entry[0].replace('&', '%26').replace(':', '%3A').replace(' ', '+')
        entry[0] = encoded
        request_urls.append('http://www.omdbapi.com/?t=' + encoded +
                            '&y=' + entry[1] + '&plot=short&r=json')
    movie_data = []
    for url in request_urls:
        response = urllib2.urlopen(url)
        #The JSON body sits inside the first <p> tag of the parsed page.
        movie_data.append(bs(response).p.string)
    return movie_data
''' | |
Next, define a function that will take the JSON data string and parse it into a dictionary | |
for each of the fields. | |
''' | |
def Extract_Movie_Data(JSON_Data):
    '''
    Parse an OMDb JSON string into a dictionary of field name -> value.

    The string is split on double quotes rather than parsed with a JSON
    library, so only the first 40 quoted tokens (20 key/value pairs) are
    kept and separator tokens (',', ':', '') between quoted strings are
    discarded.  Keys and values alternate in the token stream: even
    positions are keys, odd positions are values.
    '''
    #Strip the outer braces, then split on the quote marks that delimit
    #every key and value in the JSON text.
    tokens = str(JSON_Data[1:-1]).split('"')
    tokens = [t.replace('"', '') for t in tokens]
    #Bug fix: the original filtered with `element is ','` etc., which tests
    #object identity and only works by accident of CPython string interning;
    #membership in a tuple uses real equality.
    cleaned = [t for t in tokens if t not in (',', ':', '')]
    cleaned = cleaned[0:40]  #Only keep the first 40 elements (ignore repeats)
    keys = cleaned[0::2]     #Even positions are field names
    values = cleaned[1::2]   #Odd positions are field values
    #zip truncates to the shorter list, matching the original's pairing.
    return dict(zip(keys, values))
''' | |
Next, use the functions that were just defined to pull the movie data. | |
''' | |
# ---- Driver: scrape titles, enrich via OMDb, and persist to CSV ----
Movie_Titles = Get_Movie_Titles(5000) #Get the top 5000 movie titles from IMDB
# NOTE(review): this is an alias, not a copy -- Get_Movie_Data mutates the
# titles in place, so both names see the same (URL-encoded) title lists.
Clean_Movie_Titles = Movie_Titles
Movie_Data = Get_Movie_Data(Movie_Titles) #Pull the data from each of those titles
Combined_Movie_List = [] #Create an empty list to hold title, year, box office earnings and JSON data together
for i in range(len(Movie_Titles)): #Populate the list
    Combined_Movie_List.append([Clean_Movie_Titles[i][0], Clean_Movie_Titles[i][1], Clean_Movie_Titles[i][2], Movie_Data[i]])
JSON_List = [i[3] for i in Combined_Movie_List] #Create list of just the JSON data
Movie_List_of_Dict = []
for i in JSON_List: #Use the function previously defined to create a dictionary for all JSON strings
    movie_data = Extract_Movie_Data(i)
    Movie_List_of_Dict.append(movie_data) #Place all of the newly created dictionaries into a list
# Assumes every movie's JSON yields the same keys as the first one -- TODO confirm.
Movie_Dict_Keys = Movie_List_of_Dict[0].keys() #Store all of the dictionary key values as a list
Movie_Dict = {}
Box_Office_Dict = {}
for i in range(len(Clean_Movie_Titles)): #Create a dictionary of all of the box office values, using the movie title as a key
    Box_Office_Dict[Clean_Movie_Titles[i][0]] = Clean_Movie_Titles[i][2]
for i in range(len(Clean_Movie_Titles)): #Create a dictionary of all of the dictionaries (from JSON), using the movie title as a key
    Movie_Dict[Clean_Movie_Titles[i][0]] = Movie_List_of_Dict[i]
Movie_DF = pd.DataFrame(Movie_Dict, index = Movie_Dict_Keys) #Create data frame out of dictionaries
Movie_DF = Movie_DF.append(Box_Office_Dict, ignore_index = True) #Add box office earnings info
Movie_DF = Movie_DF.T #Transpose the data frame to make it more manageable
columns = Movie_Dict_Keys #Create column names out of the dictionary keys
columns.append('Box Office Earnings') #Add 'Box Office Earnings' to column header list
columns_range = range(len(columns))
columns_dict = {}
for i in range(len(columns)): #Create dictionary to match old column headers with new column headers
    columns_dict[columns_range[i]] = columns[i]
Movie_DF = Movie_DF.rename(columns = columns_dict) #Rename the column headers
Movie_DF.to_csv('Movie_DF_11_20_2014.csv') #Save the data frame to a CSV file for future use
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment