# scrapehero/fandango.py

## fandango.py
from selenium.webdriver.common.keys import Keys
from time import sleep
import unicodecsv as csv
import argparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


# XPath expressions for the parts of the Fandango page this scraper reads.
XPATH_ZIP_INPUT = '//input[contains(@class,"search")]'
XPATH_THEATERS = '//div[contains(@class,"fd-showtimes")]//li[@class="fd-theater"]'
XPATH_DATES = '//ul[@id="scroll-date-picker__list"]//li'


def _clean_text(text):
    """Collapse every run of whitespace in *text* to one space; pass None through."""
    if text is None:
        return None
    return ' '.join(text.split())


def _first_text(elements):
    """Return the cleaned visible text of the first element, or None for an empty list."""
    return _clean_text(elements[0].text) if elements else None


def _split_runtime_info(inner_html):
    """Split the rating/runtime paragraph HTML into (movie_rating, duration, genre).

    The text before the first ``<br>`` is either ``"<rating>, <duration>"`` or
    just a duration containing ``min``; the text after the ``<br>`` is used as
    the genre. Anything that cannot be recognised yields None for that field.
    """
    if not inner_html:
        return None, None, None

    parts = inner_html.split('<br>')
    headline = parts[0]
    # The genre line is optional: do not index past the end when there is no <br>.
    genre = parts[1].strip() if len(parts) > 1 else None

    if ',' in headline:
        pieces = headline.split(',')
        return pieces[0].strip(), pieces[1].strip(), genre
    if 'min' in headline:
        return None, headline.strip(), genre
    return None, None, None


def _collect_listings(driver, show_date, zipcode):
    """Read every theater/movie pair from the currently displayed showtimes page.

    Returns a list of dicts keyed by the CSV column names. Entries without a
    star rating are reported as "not a movie" and skipped.
    """
    listings = []
    theater_count = len(driver.find_elements_by_xpath(XPATH_THEATERS))
    if not theater_count:
        print("No theaters found")
        return listings

    # XPath positions are 1-based.
    for i in range(1, theater_count + 1):
        theater = '%s[%d]' % (XPATH_THEATERS, i)
        header = theater + '//div[@class="fd-theater__header"]'

        theater_name = _first_text(
            driver.find_elements_by_xpath(header + '//a[@class="light"]'))
        theater_address = _first_text(
            driver.find_elements_by_xpath(header + '//div[@class="fd-theater__address-wrap"]'))
        movie_count = len(driver.find_elements_by_xpath(theater + '//li[@class="fd-movie"]'))

        if not movie_count:
            print("No movies in %s theater" % (theater_name))
            continue

        for j in range(1, movie_count + 1):
            details = '%s//li[@class="fd-movie"][%d]//div[@class="fd-movie__details"]' % (theater, j)

            movie_name = _first_text(
                driver.find_elements_by_xpath(details + '//a[@class="dark"]'))
            stars = driver.find_elements_by_xpath(
                details + '//div[@class="js-fd-star-rating fd-star-rating "]')
            rating = _clean_text(stars[0].get_attribute("data-star-rating")) if stars else None

            if not rating:
                print("This is not a movie")
                continue

            runtime = driver.find_elements_by_xpath(
                details + '//p[@class="fd-movie__rating-runtime"]')
            inner_html = runtime[0].get_attribute('innerHTML') if runtime else None
            movie_rating, hours, genre = _split_runtime_info(inner_html)

            listings.append({
                "Theater_Name": theater_name,
                "Theater_Address": theater_address,
                "Movie_Name": movie_name,
                "Show_Date": show_date,
                "Movie_Rating": movie_rating,
                "Star_Rating": rating,
                "Duration": hours,
                "Genre": genre,
                "Location_or_Zipcode": zipcode,
            })
    return listings


def parse(location, showdate):
    """Scrape Fandango showtimes for *location* on *showdate*.

    Opens https://www.fandango.com in Firefox, types *location* into the
    search box, selects the date tab whose ``data-show-time-date`` attribute
    equals *showdate*, and collects one record per movie per theater.

    Args:
        location: Zip code or location text typed into the search box.
        showdate: Date string compared against the date tabs'
            ``data-show-time-date`` attribute (the script's usage example
            passes ``2017-11-29``).

    Returns:
        A list of dicts with the keys Theater_Name, Theater_Address,
        Movie_Name, Show_Date, Movie_Rating, Star_Rating, Duration, Genre and
        Location_or_Zipcode. The list is empty when the date or theaters are
        not found. None is returned when scraping failed part-way.
    """
    driver = webdriver.Firefox()
    try:
        driver.get('https://www.fandango.com')
        sleep(10)  # fixed wait for the landing page to finish loading

        try:
            zip_inputs = driver.find_elements_by_xpath(XPATH_ZIP_INPUT)
            if not zip_inputs:
                raise RuntimeError("location search box not found")
            zip_inputs[0].send_keys(location)
            zip_inputs[0].send_keys(Keys.ENTER)
            sleep(10)

            for date in driver.find_elements_by_xpath(XPATH_DATES):
                show_date = date.get_attribute("data-show-time-date")
                if show_date != showdate:
                    continue
                date.click()
                sleep(10)
                # Return straight away: the remaining date elements are not
                # needed once the requested day has been scraped.
                return _collect_listings(driver, show_date, location)

            print("Searched date not found in website")
            return []
        except Exception as error:
            # Best-effort scraper: report the cause instead of hiding it.
            print("Failed to load movies: %s" % error)
            return None
    finally:
        # Always close the browser so repeated runs do not leak Firefox processes.
        driver.quit()

if __name__ == "__main__":
    # Command-line entry point: scrape showtimes and write them to a CSV file.
    # Example: python fandango.py 20001 2017-11-29

    argparser = argparse.ArgumentParser()
    argparser.add_argument('location', help='movie location', type=str)
    argparser.add_argument('showdate', help='movie show time', type=str)
    args = argparser.parse_args()
    location = args.location
    showdate = args.showdate

    print("Fetching movie details")
    scraped_data = parse(location, showdate)
    print("Writing data to output file")

    fieldnames = ['Theater_Name', 'Theater_Address', 'Movie_Name',
                  'Show_Date', 'Location_or_Zipcode', 'Duration', 'Genre',
                  'Movie_Rating', 'Star_Rating']

    # unicodecsv writes encoded bytes, so the file has to be opened in binary mode.
    with open('%s-%s-movie-results.csv' % (location, showdate), 'wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        # The header is written even when no rows follow.
        writer.writeheader()
        if scraped_data:
            for data in scraped_data:
                writer.writerow(data)
        else:
            print("Your search for %s, in %s does not match any movies" % (location, showdate))
	from selenium.webdriver.common.keys import Keys
	from time import sleep
	import unicodecsv as csv
	import argparse
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.common.exceptions import TimeoutException


	# XPath expressions for the parts of the Fandango page this scraper reads.
	XPATH_ZIP_INPUT = '//input[contains(@class,"search")]'
	XPATH_THEATERS = '//div[contains(@class,"fd-showtimes")]//li[@class="fd-theater"]'
	XPATH_DATES = '//ul[@id="scroll-date-picker__list"]//li'


	def _clean_text(text):
	    """Collapse every run of whitespace in *text* to one space; pass None through."""
	    if text is None:
	        return None
	    return ' '.join(text.split())


	def _first_text(elements):
	    """Return the cleaned visible text of the first element, or None for an empty list."""
	    return _clean_text(elements[0].text) if elements else None


	def _split_runtime_info(inner_html):
	    """Split the rating/runtime paragraph HTML into (movie_rating, duration, genre).

	    The text before the first ``<br>`` is either ``"<rating>, <duration>"`` or
	    just a duration containing ``min``; the text after the ``<br>`` is used as
	    the genre. Anything that cannot be recognised yields None for that field.
	    """
	    if not inner_html:
	        return None, None, None

	    parts = inner_html.split('<br>')
	    headline = parts[0]
	    # The genre line is optional: do not index past the end when there is no <br>.
	    genre = parts[1].strip() if len(parts) > 1 else None

	    if ',' in headline:
	        pieces = headline.split(',')
	        return pieces[0].strip(), pieces[1].strip(), genre
	    if 'min' in headline:
	        return None, headline.strip(), genre
	    return None, None, None


	def _collect_listings(driver, show_date, zipcode):
	    """Read every theater/movie pair from the currently displayed showtimes page.

	    Returns a list of dicts keyed by the CSV column names. Entries without a
	    star rating are reported as "not a movie" and skipped.
	    """
	    listings = []
	    theater_count = len(driver.find_elements_by_xpath(XPATH_THEATERS))
	    if not theater_count:
	        print("No theaters found")
	        return listings

	    # XPath positions are 1-based.
	    for i in range(1, theater_count + 1):
	        theater = '%s[%d]' % (XPATH_THEATERS, i)
	        header = theater + '//div[@class="fd-theater__header"]'

	        theater_name = _first_text(
	            driver.find_elements_by_xpath(header + '//a[@class="light"]'))
	        theater_address = _first_text(
	            driver.find_elements_by_xpath(header + '//div[@class="fd-theater__address-wrap"]'))
	        movie_count = len(driver.find_elements_by_xpath(theater + '//li[@class="fd-movie"]'))

	        if not movie_count:
	            print("No movies in %s theater" % (theater_name))
	            continue

	        for j in range(1, movie_count + 1):
	            details = '%s//li[@class="fd-movie"][%d]//div[@class="fd-movie__details"]' % (theater, j)

	            movie_name = _first_text(
	                driver.find_elements_by_xpath(details + '//a[@class="dark"]'))
	            stars = driver.find_elements_by_xpath(
	                details + '//div[@class="js-fd-star-rating fd-star-rating "]')
	            rating = _clean_text(stars[0].get_attribute("data-star-rating")) if stars else None

	            if not rating:
	                print("This is not a movie")
	                continue

	            runtime = driver.find_elements_by_xpath(
	                details + '//p[@class="fd-movie__rating-runtime"]')
	            inner_html = runtime[0].get_attribute('innerHTML') if runtime else None
	            movie_rating, hours, genre = _split_runtime_info(inner_html)

	            listings.append({
	                "Theater_Name": theater_name,
	                "Theater_Address": theater_address,
	                "Movie_Name": movie_name,
	                "Show_Date": show_date,
	                "Movie_Rating": movie_rating,
	                "Star_Rating": rating,
	                "Duration": hours,
	                "Genre": genre,
	                "Location_or_Zipcode": zipcode,
	            })
	    return listings


	def parse(location, showdate):
	    """Scrape Fandango showtimes for *location* on *showdate*.

	    Opens https://www.fandango.com in Firefox, types *location* into the
	    search box, selects the date tab whose ``data-show-time-date`` attribute
	    equals *showdate*, and collects one record per movie per theater.

	    Args:
	        location: Zip code or location text typed into the search box.
	        showdate: Date string compared against the date tabs'
	            ``data-show-time-date`` attribute (the script's usage example
	            passes ``2017-11-29``).

	    Returns:
	        A list of dicts with the keys Theater_Name, Theater_Address,
	        Movie_Name, Show_Date, Movie_Rating, Star_Rating, Duration, Genre and
	        Location_or_Zipcode. The list is empty when the date or theaters are
	        not found. None is returned when scraping failed part-way.
	    """
	    driver = webdriver.Firefox()
	    try:
	        driver.get('https://www.fandango.com')
	        sleep(10)  # fixed wait for the landing page to finish loading

	        try:
	            zip_inputs = driver.find_elements_by_xpath(XPATH_ZIP_INPUT)
	            if not zip_inputs:
	                raise RuntimeError("location search box not found")
	            zip_inputs[0].send_keys(location)
	            zip_inputs[0].send_keys(Keys.ENTER)
	            sleep(10)

	            for date in driver.find_elements_by_xpath(XPATH_DATES):
	                show_date = date.get_attribute("data-show-time-date")
	                if show_date != showdate:
	                    continue
	                date.click()
	                sleep(10)
	                # Return straight away: the remaining date elements are not
	                # needed once the requested day has been scraped.
	                return _collect_listings(driver, show_date, location)

	            print("Searched date not found in website")
	            return []
	        except Exception as error:
	            # Best-effort scraper: report the cause instead of hiding it.
	            print("Failed to load movies: %s" % error)
	            return None
	    finally:
	        # Always close the browser so repeated runs do not leak Firefox processes.
	        driver.quit()

	if __name__ == "__main__":
	    # Command-line entry point: scrape showtimes and write them to a CSV file.
	    # Example: python fandango.py 20001 2017-11-29

	    argparser = argparse.ArgumentParser()
	    argparser.add_argument('location', help='movie location', type=str)
	    argparser.add_argument('showdate', help='movie show time', type=str)
	    args = argparser.parse_args()
	    location = args.location
	    showdate = args.showdate

	    print("Fetching movie details")
	    scraped_data = parse(location, showdate)
	    print("Writing data to output file")

	    fieldnames = ['Theater_Name', 'Theater_Address', 'Movie_Name',
	                  'Show_Date', 'Location_or_Zipcode', 'Duration', 'Genre',
	                  'Movie_Rating', 'Star_Rating']

	    # unicodecsv writes encoded bytes, so the file has to be opened in binary mode.
	    with open('%s-%s-movie-results.csv' % (location, showdate), 'wb') as csvfile:
	        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
	        # The header is written even when no rows follow.
	        writer.writeheader()
	        if scraped_data:
	            for data in scraped_data:
	                writer.writerow(data)
	        else:
	            print("Your search for %s, in %s does not match any movies" % (location, showdate))