Skip to content

Instantly share code, notes, and snippets.

@chadmhorner
Created January 30, 2019 01:07
Show Gist options
  • Save chadmhorner/fe990321ee54e2eed5f24b4203751af5 to your computer and use it in GitHub Desktop.
Save chadmhorner/fe990321ee54e2eed5f24b4203751af5 to your computer and use it in GitHub Desktop.
from readypipe import requests, starting_task, subtask, schedule, save, schedule_many, open_browser
from json import loads
from lxml import etree
import datetime
from pandas import read_csv
from requests.exceptions import ConnectTimeout
# Theater search page for a ZIP code; both %s placeholders are filled with the ZIP code.
SEARCH_URL = 'https://www.fandango.com/%s_movietimes?mode=general&q=%s'
# JSON listing of theaters with showtimes; placeholders are (ZIP code, date as YYYY-MM-DD).
JSON_ENDPOINT = 'https://www.fandango.com/napi/theaterswithshowtimes?zipCode=%s&city=&state=&date=%s&page=1&favTheaterOnly=false&limit=10&isdesktop=true'
@starting_task
def load_zipcodes():
    """Schedule one ``sweep_theaters`` subtask per ZIP code that ends in "1".

    Reads the ``ZipCode_five`` column of ``ZIP.csv``, keeping every value as
    text rather than letting pandas parse it as a number, and passes the
    selected ZIP codes to ``schedule_many`` as 1-tuples of arguments.
    """
    zipcodes = read_csv('ZIP.csv', converters={'ZipCode_five': str})  # this is a CSV of zipcodes
    # We only scrape zip codes ending in 1, because theaters in neighboring
    # zip codes will be found in the search anyway.  ``endswith`` (rather than
    # indexing ``[-1]``) also tolerates an empty string value.
    to_load = [
        (zipcode,)
        for zipcode in zipcodes['ZipCode_five'].tolist()
        if zipcode.endswith('1')
    ]
    schedule_many('sweep_theaters', to_load)
def _theater_record(theater):
    """Flatten one theater JSON object into the row saved to 'theaters'."""
    # 'geo' may be absent or null; fall back to an empty mapping so the
    # coordinates come out as None instead of raising AttributeError.
    geo = theater.get('geo') or {}
    return {
        'theater_id': theater.get('id'),
        'address': theater.get('address1'),
        'chain': theater.get('chainName'),
        'city': theater.get('city'),
        'latitude': geo.get('latitude'),
        'longitude': geo.get('longitude'),
        'name': theater.get('name'),
        'name_slug': theater.get('sluggedName'),
        'url': theater.get('theaterPageUrl'),
        'state': theater.get('state'),
        'zip': theater.get('zip'),
    }


def _movie_record(theater_data, movie_id, movie_name, movie_format,
                  time=None, ticketing_url=None,
                  ticket_class=None, ticket_price=None):
    """Build one 'movies' row: prefixed theater columns plus showing details."""
    # 'theater_id' already carries the prefix; every other theater column
    # is renamed to 'theater_<column>' so it cannot clash with movie fields.
    record = {
        (key if key == 'theater_id' else 'theater_' + key): value
        for key, value in theater_data.items()
    }
    record.update({
        'movie_id': movie_id,
        'name': movie_name,
        'format': movie_format,
        'time': time,
        'ticketing_url': ticketing_url,
        'ticket_class': ticket_class,
        'ticket_price': ticket_price,
    })
    return record


def _save_amenity_group(theater_data, movie_id, movie_name, movie_format, amenity):
    """Save the 'movies' rows for one amenity group of a movie variant."""
    showtimes = amenity.get('showtimes') or []
    if not showtimes:
        # No showtimes listed: still record the movie/format, with the
        # showing and pricing columns left as None.
        save('movies', _movie_record(theater_data, movie_id, movie_name, movie_format))
        return

    # Only the last showtime of the group is priced.
    final_showtime = showtimes[-1]
    movie_time = final_showtime.get('date')
    ticketing_url = final_showtime.get('ticketingUrl')
    try:
        ticketing_page = requests.get_dom_from_content(ticketing_url)
        ticket_classes = ticketing_page.xpath('//*/input[@name="pricedesc"]')
        ticket_prices = ticketing_page.xpath('//*/input[@name="price"]')
        # zip pairs each class with its price and stops at the shorter list,
        # so a page with fewer price inputs than classes cannot raise IndexError.
        for class_input, price_input in zip(ticket_classes, ticket_prices):
            ticket_class = class_input.attrib['value']
            if ticket_class == 'N/A':
                continue
            save('movies', _movie_record(
                theater_data, movie_id, movie_name, movie_format,
                time=movie_time,
                ticketing_url=ticketing_url,
                ticket_class=ticket_class,
                ticket_price=price_input.attrib['value'],
            ))
    except ConnectTimeout:
        # Best effort: skip this amenity group when the request times out.
        return


@subtask
def sweep_theaters(zipcode):
    """Scrape theaters, movies and ticket prices around one ZIP code.

    Loads the theaters-with-showtimes JSON for ``zipcode`` and today's date
    through a headless browser, saves one row per theater to 'theaters', and
    saves rows to 'movies' for every movie variant / amenity group:

    * one row per ticket class whose description is not 'N/A', priced from
      the ticketing page of the group's last showtime;
    * one row with empty showing/pricing columns when the group has no
      showtimes.

    Returns without saving anything when the endpoint response does not
    contain a parseable JSON object.
    """
    today = datetime.date.today().strftime('%Y-%m-%d')
    with open_browser() as browser:  # use headless browser
        browser.goto('https://www.fandango.com')  # visit homepage
        browser.goto(SEARCH_URL % (zipcode, zipcode))  # visit search page
        browser.goto(JSON_ENDPOINT % (zipcode, today))  # load JSON endpoint
        content = browser.content()
    # The browser is only needed to fetch the payload; it is released before
    # the slower per-showtime ticketing requests start.

    dom = etree.HTML(content)
    if dom is None:
        # NOTE(review): guard for content lxml cannot build a tree from --
        # confirm against the lxml version in use.
        return
    try:
        # The browser renders the JSON response inside a <pre> element.
        js = loads(dom.xpath('//pre')[0].text)
    except (IndexError, TypeError, ValueError):
        # No <pre>, an empty <pre> (text is None) or invalid JSON:
        # nothing to scrape for this ZIP code.
        return
    if not isinstance(js, dict):
        return

    for theater in js.get('theaters') or []:  # save theater data
        theater_data = _theater_record(theater)
        save('theaters', theater_data)
        for movie in theater.get('movies') or []:  # save movie data
            movie_id = movie.get('id')
            movie_name = movie.get('title')
            for variant in movie.get('variants') or []:
                movie_format = variant.get('format')
                for amenity in variant.get('amenityGroups') or []:
                    _save_amenity_group(
                        theater_data, movie_id, movie_name, movie_format, amenity,
                    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment