Skip to content

Instantly share code, notes, and snippets.

Last active March 8, 2018 21:22
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save scrapehero/963aa61f6057cfc23d5fad1d069df555 to your computer and use it in GitHub Desktop.
Scraper to extract movie details and showtimes from Fandango.com
import argparse
from time import sleep

import unicodecsv as csv
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# XPath expressions. The page-level ones are absolute; the theater- and
# movie-level ones are relative ('.//') to the element they are queried from.
_XPATH_ZIP_INPUT = '//input[contains(@class,"search")]'
_XPATH_THEATERS = '//div[contains(@class,"fd-showtimes")]//li[@class="fd-theater"]'
_XPATH_DATES = '//ul[@id="scroll-date-picker__list"]//li'
_XPATH_THEATER_NAME = './/div[@class="fd-theater__header"]//a[@class="light"]'
_XPATH_THEATER_ADDRESS = './/div[@class="fd-theater__header"]//div[@class="fd-theater__address-wrap"]'
_XPATH_MOVIES = './/li[@class="fd-movie"]'
_XPATH_MOVIE_NAME = './/div[@class="fd-movie__details"]//a[@class="dark"]'
_XPATH_STAR_RATING = './/div[@class="fd-movie__details"]//div[@class="js-fd-star-rating fd-star-rating "]'
_XPATH_HOURS_GENRE = './/div[@class="fd-movie__details"]//p[@class="fd-movie__rating-runtime"]'

# NOTE(review): both values below are reviewer-chosen; nothing in the file
# fixes them. Tune against the live site.
_PAGE_LOAD_TIMEOUT = 30   # seconds to wait for the date picker to appear
_DATE_SWITCH_PAUSE = 5    # seconds to let the listing refresh after a click


def _squash_whitespace(text):
    """Collapse every run of whitespace in *text* into a single space."""
    return ' '.join(text.split())


def _first_text(elements):
    """Return the normalised text of the first element, or None if empty."""
    if not elements:
        return None
    return _squash_whitespace(elements[0].text)


def _split_rating_runtime_genre(lines):
    """Split the rating/runtime paragraph into (movie_rating, hours, genre).

    *lines* is the paragraph's innerHTML split on '<br>'. The first entry is
    either '<rating>, <runtime>' or just a runtime containing 'min'; the
    second entry, when present, is the genre. Anything else yields Nones.
    """
    if not lines:
        return None, None, None
    head = lines[0]
    # Guard the genre lookup: markup without a '<br>' has only one entry.
    genre = lines[1].strip() if len(lines) > 1 else None
    if ',' in head:
        pieces = head.split(',')
        return pieces[0].strip(), pieces[1].strip(), genre
    if 'min' in head:
        return None, head.strip(), genre
    return None, None, None


def _collect_listings(driver, show_date, searched_zipcode):
    """Build one dict per rated movie for every theater on the current page."""
    listings = []
    theaters = driver.find_elements(By.XPATH, _XPATH_THEATERS)
    if not theaters:
        print("No theaters found")
        return listings

    for theater in theaters:
        theater_name = _first_text(
            theater.find_elements(By.XPATH, _XPATH_THEATER_NAME))
        theater_address = _first_text(
            theater.find_elements(By.XPATH, _XPATH_THEATER_ADDRESS))
        movies = theater.find_elements(By.XPATH, _XPATH_MOVIES)
        if not movies:
            print("No movies in %s theater" % (theater_name))
            continue

        for movie in movies:
            movie_name = _first_text(
                movie.find_elements(By.XPATH, _XPATH_MOVIE_NAME))
            rating_nodes = movie.find_elements(By.XPATH, _XPATH_STAR_RATING)
            raw_rating = (rating_nodes[0].get_attribute("data-star-rating")
                          if rating_nodes else None)
            rating = _squash_whitespace(raw_rating) if raw_rating else None
            # Entries without a star rating are skipped, not recorded.
            if not rating:
                print("This is not a movie")
                continue

            detail_nodes = movie.find_elements(By.XPATH, _XPATH_HOURS_GENRE)
            detail_lines = None
            if detail_nodes:
                inner_html = detail_nodes[0].get_attribute('innerHTML') or ''
                detail_lines = inner_html.split('<br>')
            movie_rating, hours, genre = _split_rating_runtime_genre(
                detail_lines)

            listings.append({
                "Theater_Name": theater_name,
                "Theater_Address": theater_address,
                "Movie_Name": movie_name,
                "Show_Date": show_date,
                "Movie_Rating": movie_rating,
                "Star_Rating": rating,
                "Duration": hours,
                "Genre": genre,
                "Location_or_Zipcode": searched_zipcode,
            })
    return listings


def parse(location, showdate):
    """Scrape theater and movie listings for a location and show date.

    Opens a Firefox session, submits *location* in the site's search box,
    selects the date-picker entry whose ``data-show-time-date`` attribute
    equals *showdate*, and collects one record per rated movie per theater.

    Args:
        location: zip code or place name typed into the search box.
        showdate: date string compared against ``data-show-time-date``
            (the usage example in this file passes ``2017-11-29``).

    Returns:
        A list of dicts with the keys Theater_Name, Theater_Address,
        Movie_Name, Show_Date, Movie_Rating, Star_Rating, Duration, Genre
        and Location_or_Zipcode. The list is empty when the page fails to
        load, the date is not offered, or nothing is found; a message is
        printed in each of those cases.
    """
    # NOTE(review): the URL literal was blank in the copy under review. This
    # value is inferred from the "fd-" class names in the XPaths - confirm.
    url = 'https://www.fandango.com'
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        zip_inputs = driver.find_elements(By.XPATH, _XPATH_ZIP_INPUT)
        if not zip_inputs:
            print("Failed to load movies")
            return []
        zip_inputs[0].send_keys(location)
        zip_inputs[0].send_keys(Keys.ENTER)

        # The date picker is the first thing read from the results page, so
        # wait for it before querying anything.
        try:
            WebDriverWait(driver, _PAGE_LOAD_TIMEOUT).until(
                EC.presence_of_element_located(
                    (By.ID, 'scroll-date-picker__list')))
        except TimeoutException:
            print("Failed to load movies")
            return []

        for date in driver.find_elements(By.XPATH, _XPATH_DATES):
            show_date = date.get_attribute("data-show-time-date")
            if show_date != showdate:
                continue
            # NOTE(review): presumably the listing only shows the selected
            # date, so switch to it and give the page time to refresh -
            # confirm against the live site.
            date.click()
            sleep(_DATE_SWITCH_PAUSE)
            return _collect_listings(driver, show_date, location)

        print("Searched date not found in website")
        return []
    finally:
        # Always release the browser, whichever path returned.
        driver.quit()
if __name__ == "__main__":
    # Command-line entry point.
    # Example: python <this_script> 20001 2017-11-29
    # Scrapes the listings and writes them to
    # "<location>-<showdate>-movie-results.csv" in the working directory.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('location', help='movie location', type=str)
    argparser.add_argument('showdate', help='movie show time', type=str)
    args = argparser.parse_args()
    location = args.location
    showdate = args.showdate

    print("Fetching movie details")
    scraped_data = parse(location, showdate)

    print("Writing data to output file")
    # unicodecsv encodes rows itself and writes bytes, so the file must be
    # opened in binary mode.
    with open('%s-%s-movie-results.csv' % (location, showdate), 'wb') as csvfile:
        fieldnames = ['Theater_Name', 'Theater_Address', 'Movie_Name',
                      'Show_Date', 'Location_or_Zipcode', 'Duration', 'Genre',
                      'Movie_Rating', 'Star_Rating']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        # Emit the column names so the file is self-describing.
        writer.writeheader()
        if scraped_data:
            for data in scraped_data:
                writer.writerow(data)
        else:
            print("Your search for %s, in %s does not match any movies" % (location, showdate))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment