Last active
March 8, 2018 21:22
-
-
Save scrapehero/963aa61f6057cfc23d5fad1d069df555 to your computer and use it in GitHub Desktop.
Scraper to extract movie details and showtimes from Fandango.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium.webdriver.common.keys import Keys | |
from time import sleep | |
import unicodecsv as csv | |
import argparse | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.common.exceptions import TimeoutException | |
# Absolute XPaths evaluated against the whole results page.
_XPATH_ZIP_INPUT = '//input[contains(@class,"search")]'
_XPATH_DATES = '//ul[@id="scroll-date-picker__list"]//li'
_XPATH_THEATERS = '//div[contains(@class,"fd-showtimes")]//li[@class="fd-theater"]'

# XPaths evaluated relative to a single theater <li> element.
_REL_THEATER_NAME = './/div[@class="fd-theater__header"]//a[@class="light"]'
_REL_THEATER_ADDRESS = ('.//div[@class="fd-theater__header"]'
                        '//div[@class="fd-theater__address-wrap"]')
_REL_MOVIES = './/li[@class="fd-movie"]'

# XPaths evaluated relative to a single movie <li> element.
_REL_MOVIE_NAME = './/div[@class="fd-movie__details"]//a[@class="dark"]'
_REL_STAR_RATING = ('.//div[@class="fd-movie__details"]'
                    '//div[@class="js-fd-star-rating fd-star-rating "]')
_REL_RATING_RUNTIME = ('.//div[@class="fd-movie__details"]'
                       '//p[@class="fd-movie__rating-runtime"]')


def _clean_text(raw):
    """Collapse every run of whitespace in *raw* into a single space."""
    return ' '.join(raw.split())


def _first_text(elements):
    """Return the cleaned text of the first element, or None for an empty list."""
    return _clean_text(elements[0].text) if elements else None


def _split_rating_runtime_genre(inner_html):
    """Split the rating/runtime paragraph markup into its three fields.

    The text before the first ``<br>`` is either ``"<rating>, <runtime>"`` or a
    bare runtime containing ``"min"``; the text after it is taken as the genre.

    Returns:
        A ``(movie_rating, hours, genre)`` tuple. Every field is None when the
        markup is missing or its first segment matches neither form.
    """
    if not inner_html:
        return None, None, None
    segments = inner_html.split('<br>')
    head = segments[0]
    # The original indexed segments[1] unconditionally, which raised
    # IndexError whenever the paragraph had no <br>.
    genre = segments[1].strip() if len(segments) > 1 else None
    if ',' in head:
        pieces = head.split(',')
        return pieces[0].strip(), pieces[1].strip(), genre
    if 'min' in head:
        return None, head.strip(), genre
    return None, None, None


def _scrape_theater(theater, show_date, zipcode):
    """Return one listing dict per rated movie found under a theater element."""
    theater_name = _first_text(theater.find_elements(By.XPATH, _REL_THEATER_NAME))
    theater_address = _first_text(
        theater.find_elements(By.XPATH, _REL_THEATER_ADDRESS))
    movies = theater.find_elements(By.XPATH, _REL_MOVIES)
    if not movies:
        print("No movies in %s theater" % (theater_name,))
        return []

    listings = []
    for movie in movies:
        movie_name = _first_text(movie.find_elements(By.XPATH, _REL_MOVIE_NAME))
        stars = movie.find_elements(By.XPATH, _REL_STAR_RATING)
        raw_stars = stars[0].get_attribute("data-star-rating") if stars else None
        rating = _clean_text(raw_stars) if raw_stars else None
        # Entries without a star rating are skipped, as in the original code.
        if not rating:
            print("This is not a movie")
            continue

        details = movie.find_elements(By.XPATH, _REL_RATING_RUNTIME)
        inner_html = details[0].get_attribute('innerHTML') if details else None
        movie_rating, hours, genre = _split_rating_runtime_genre(inner_html)
        listings.append({
            "Theater_Name": theater_name,
            "Theater_Address": theater_address,
            "Movie_Name": movie_name,
            "Show_Date": show_date,
            "Movie_Rating": movie_rating,
            "Star_Rating": rating,
            "Duration": hours,
            "Genre": genre,
            "Location_or_Zipcode": zipcode,
        })
    return listings


def _scrape_listings(driver, zipcode, wanted_date):
    """Search for a zipcode, open the wanted date tab and collect its listings."""
    zip_inputs = driver.find_elements(By.XPATH, _XPATH_ZIP_INPUT)
    zip_inputs[0].send_keys(zipcode)
    zip_inputs[0].send_keys(Keys.ENTER)
    sleep(10)  # fixed wait for the results page to render

    for date in driver.find_elements(By.XPATH, _XPATH_DATES):
        show_date = date.get_attribute("data-show-time-date")
        if show_date != wanted_date:
            continue
        date.click()
        sleep(10)  # fixed wait for the selected date's showtimes to render
        theaters = driver.find_elements(By.XPATH, _XPATH_THEATERS)
        if not theaters:
            print("No theaters found")
        listings = []
        for theater in theaters:
            listings.extend(_scrape_theater(theater, show_date, zipcode))
        return listings

    print("Searched date not found in website")
    return None


def parse(location, showdate):
    """Scrape movie listings from fandango.com for a location and show date.

    Opens a Firefox session, submits *location* in the site's search box,
    selects the date tab whose ``data-show-time-date`` attribute equals
    *showdate*, and collects one record per rated movie per theater.

    Args:
        location: Zipcode or location text typed into the search box.
        showdate: Date string compared against each date tab's
            ``data-show-time-date`` attribute (e.g. ``2017-11-29``).

    Returns:
        A list of dicts with the keys Theater_Name, Theater_Address,
        Movie_Name, Show_Date, Movie_Rating, Star_Rating, Duration, Genre and
        Location_or_Zipcode. The list may be empty. Returns None when the date
        is not offered or when scraping fails.
    """
    url = 'https://www.fandango.com'
    response = webdriver.Firefox()
    try:
        response.get(url)
        sleep(10)  # fixed wait for the landing page to render
        try:
            return _scrape_listings(response, location, showdate)
        except Exception as error:
            # Best-effort scrape: report the cause instead of hiding it.
            print("Failed to load movies: %s" % (error,))
            return None
    finally:
        # Always shut the browser down; the original never closed it.
        response.quit()
if __name__ == "__main__":
    # Example: python 2031_fandango.py 20001 2017-11-29
    argparser = argparse.ArgumentParser()
    argparser.add_argument('location', help='movie location', type=str)
    argparser.add_argument('showdate', help='movie show time', type=str)
    args = argparser.parse_args()
    location = args.location
    showdate = args.showdate

    print("Fetching movie details")
    scraped_data = parse(location, showdate)

    print("Writing data to output file")
    # unicodecsv encodes rows itself and writes bytes, so the file has to be
    # opened in binary mode ('wb'), not text mode.
    with open('%s-%s-movie-results.csv' % (location, showdate), 'wb') as csvfile:
        fieldnames = ['Theater_Name', 'Theater_Address', 'Movie_Name',
                      'Show_Date', 'Location_or_Zipcode', 'Duration', 'Genre',
                      'Movie_Rating', 'Star_Rating']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        # The header is written even when nothing was scraped.
        writer.writeheader()
        if scraped_data:
            writer.writerows(scraped_data)
        else:
            print("Your search for %s, in %s does not match any movies" % (location, showdate))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment