# Gist by @cflint12, created December 7, 2014.
# Saved from: https://gist.github.com/cflint12/17be988516618b96af05
# This program was generated to scrape movie data from the IMDB website
# in conjunction with omdbapi.com.
'''
This program was generated to scrape movie data from the IMDB website
in conjunction with omdbapi.com
'''
#Import the necessary libraries
import json
import os
import urllib2

from bs4 import BeautifulSoup as bs
import pandas as pd
#Change directory to the folder you would like to store everything in:
#os.chdir('Set working directory here ...')
'''
Define a function to scrape a list of movie titles from IMDB.
The list should also contain the year the movie was released
and its box office earnings (in millions of US dollars).
The input to the function is the number of titles desired.
The function pulls the movie titles in descending order of box office earnings.
'''
def Get_Movie_Titles(number_of_titles):
    '''
    Scrape movie titles from IMDB, in descending order of US box office gross.

    Returns a list of [title, year, box_office_earnings] lists, where each
    title has been stripped of its trailing "(Year)" suffix and had accented
    e/u characters replaced with their plain ASCII equivalents.
    '''
    # Each IMDB search page serves 100 results, so build one URL per page.
    url_prefix = 'http://www.imdb.com/search/title?at=0&count=100&sort=boxoffice_gross_us&start='
    url_suffix = '&title_type=feature,tv_series,tv_movie'
    page_urls = [url_prefix + str(start) + url_suffix
                 for start in range(1, number_of_titles, 100)]

    gross_values = []   # box office earnings strings, in page order
    title_anchors = []  # <a> tags linking to a /title/ page that carry a 'title' attribute
    for page_url in page_urls:
        page = bs(urllib2.urlopen(page_url))
        # The column the results were sorted by (box office gross) is marked
        # with class "sort_col" on its TD cells.
        gross_values.extend(cell.string for cell in page.findAll('td', {"class": "sort_col"}))
        title_anchors.extend(anchor for anchor in page.findAll('a')
                             if '/title/' in anchor['href'] and anchor.has_attr('title'))

    # Keep only genuine movie titles: drop navigation/utility links and TV series.
    rejected = ('Register or login to rate this title', 'Delete', 'Home')
    kept_titles = [anchor['title'] for anchor in title_anchors
                   if anchor['title'] not in rejected
                   and 'TV Series' not in anchor['title']]

    # Split "Title (Year)" into its parts and normalize the title text.
    records = []
    for raw_title in kept_titles:
        pieces = raw_title.rsplit('(', 1)  # separate the "(Year)" suffix
        name = pieces[0].strip().replace('\xe9', 'e').replace('\xfc', 'u')
        records.append([name, pieces[1][0:4]])  # first four chars = the year digits

    # Pair every [title, year] with its box office figure, by position.
    for position, record in enumerate(records):
        record.append(gross_values[position])
    return records
'''
Next, define a function to utilize the list of cleaned movie titles and years
to pull additional data (IMDB ratings, release date, actors, director, awards, rating, etc.)
in the form of a JSON string from omdbapi.com (Open Movie Database)
The function reformats the titles to place them into URLs compatible with the site,
then pulls the data generated from each URL into a new list (movie_data), which it returns.
'''
def Get_Movie_Data(cleaned_titles):
    '''
    Fetch a JSON string from omdbapi.com for each [title, year, ...] entry.

    Side effect: rewrites each entry's title in place (&, : and spaces are
    replaced with their URL-encoded forms), as the original code did — callers
    downstream rely on seeing the rewritten titles.

    Returns a list of raw JSON response strings, one per movie, in order.
    '''
    OMDB_URLs = []  # one query URL per movie
    for title in cleaned_titles:
        # NOTE(review): only &, : and spaces are escaped here; urllib.quote
        # would be more thorough but would change the URLs the original built.
        title[0] = title[0].replace('&', '%26')
        title[0] = title[0].replace(':', '%3A')
        scraped_title = title[0] = title[0].replace(' ', '+')
        scraped_year = title[1]
        OMDB_URL = 'http://www.omdbapi.com/?t=' + scraped_title + '&y=' + scraped_year + '&plot=short&r=json'
        OMDB_URLs.append(OMDB_URL)
    movie_data = []
    for url in OMDB_URLs:
        # Bug fix: the original pushed the JSON body through BeautifulSoup and
        # read soup.p.string, which misuses an HTML parser on JSON and can
        # yield None. Read the raw response body directly instead, and close
        # the connection so it is not leaked.
        response = urllib2.urlopen(url)
        try:
            movie_data.append(response.read())
        finally:
            response.close()
    return movie_data
'''
Next, define a function that will take the JSON data string and parse it into a dictionary
for each of the fields.
'''
def Extract_Movie_Data(JSON_Data):
    '''
    Parse one omdbapi.com JSON response string into a {field: value} dict.

    Bug fix: the original hand-rolled parser split the string on double quotes
    and filtered tokens with identity comparisons (`element is ','`), which is
    fragile (it only works because CPython interns one-character strings),
    broke on any value containing an embedded quote, comma or colon, and
    silently truncated the result to the first 20 key/value pairs. Using the
    standard json module parses every field correctly. Callers receive the
    same flat string-keyed dict as before (possibly with more keys, since
    nothing is truncated any more).
    '''
    return json.loads(JSON_Data)
'''
Next, use the functions that were just defined to pull the movie data.
'''
# --- Driver script: pull the data, join it, and save it to CSV. ---
# NOTE(review): indentation below is reconstructed — the source was flattened
# by a web scrape; loop bodies were inferred from the data flow.
Movie_Titles = Get_Movie_Titles(5000) #Get the top 5000 movie titles from IMDB
Clean_Movie_Titles = Movie_Titles  # NOTE(review): alias, not a copy — both names refer to the same list object
Movie_Data = Get_Movie_Data(Movie_Titles) #Pull the data from each of those titles
Combined_Movie_List = [] #Create an empty list to hold title, year, box office earnings and JSON data together
for i in range(len(Movie_Titles)): #Populate the list: [title, year, gross, json_string] per movie
    Combined_Movie_List.append([Clean_Movie_Titles[i][0], Clean_Movie_Titles[i][1], Clean_Movie_Titles[i][2], Movie_Data[i]])
JSON_List = [i[3] for i in Combined_Movie_List] #Create list of just the JSON data
Movie_List_of_Dict = []
for i in JSON_List: #Use the function previously defined to create a dictionary for all JSON strings
    movie_data = Extract_Movie_Data(i)
    Movie_List_of_Dict.append(movie_data) #Place all of the newly created dictionaries into a list
# Field names from the first movie's dict; assumes every response has the
# same fields in the same order — TODO confirm against omdbapi error responses.
Movie_Dict_Keys = Movie_List_of_Dict[0].keys() #Store all of the dictionary key values as a list
Movie_Dict = {}
Box_Office_Dict = {}
for i in range(len(Clean_Movie_Titles)): #Create a dictionary of all of the box office values, using the movie title as a key
    Box_Office_Dict[Clean_Movie_Titles[i][0]] = Clean_Movie_Titles[i][2]
for i in range(len(Clean_Movie_Titles)): #Create a dictionary of all of the dictionaries (from JSON), using the movie title as a key
    Movie_Dict[Clean_Movie_Titles[i][0]] = Movie_List_of_Dict[i]
Movie_DF = pd.DataFrame(Movie_Dict, index = Movie_Dict_Keys) #Create data frame out of dictionaries; one column per movie
# NOTE(review): DataFrame.append is removed in modern pandas (use pd.concat);
# kept as-is since this script targets the py2-era pandas it was written for.
Movie_DF = Movie_DF.append(Box_Office_Dict, ignore_index = True) #Add box office earnings info as an extra row
Movie_DF = Movie_DF.T #Transpose the data frame: one row per movie, one column per field
columns = Movie_Dict_Keys #NOTE(review): alias — the append below also mutates Movie_Dict_Keys (harmless here, its last prior use was building the DataFrame)
columns.append('Box Office Earnings') #Add 'Box Office Earnings' to column header list
columns_range = range(len(columns))
columns_dict = {}
for i in range(len(columns)): #Create dictionary mapping positional (integer) column labels to the field names
    columns_dict[columns_range[i]] = columns[i]
Movie_DF = Movie_DF.rename(columns = columns_dict) #Rename the column headers
Movie_DF.to_csv('Movie_DF_11_20_2014.csv') #Save the data frame to a CSV file for future use
# End of gist.