Skip to content

Instantly share code, notes, and snippets.

Last active February 12, 2018 07:48
Show Gist options
  • Save scrapehero/b66b11155d6afb401685271d82c623f6 to your computer and use it in GitHub Desktop.
Save scrapehero/b66b11155d6afb401685271d82c623f6 to your computer and use it in GitHub Desktop.
Python 2 code to extract movie details (theater, title, rating, runtime, genre) for a given location and show date
from lxml import html, etree
import datetime
import requests
import re
import os
import sys
import unicodecsv as csv
import argparse
import json
from exceptions import ValueError
def parse(location, showdate):
    """Fetch the movies playing in theaters near *location* on *showdate*.

    The location text is first resolved through a location autocomplete
    request; the first match is then used to build the query for the
    theater/movie listing request.

    Args:
        location: Search text, a zip code or a "city, state" string.
        showdate: Show date as a ``YYYY-MM-DD`` string.

    Returns:
        A list with one dict per (theater, movie) pair, using the keys
        ``Theatre_Name``, ``Theatre_Address``, ``Movie_Name``,
        ``Show_Date``, ``Movie_Rating``, ``Star_Rating``, ``Duration``,
        ``Genre`` and ``Location_or_Zipcode``. The list is empty when no
        location, theater or movie is found, or when the movie request
        fails.

    Raises:
        requests.exceptions.RequestException: If the location lookup
            request itself fails (only the movie request is guarded).
        ValueError: If the location lookup response is not valid JSON.
    """
    print("Fetching Locations..")
    movie_listings = []

    # Cookies for searching theater location
    cookie = {
        'akamai_generated_location': '{"zip":"""","city":"CLIFTON","state":"NJ","county":"PASSAIC","areacode":"""","lat":"40.8800","long":"-74.1446","countrycode":""""}'
    }
    # Headers to get location details from their auto complete query
    location_headers = {
        'referer': '',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }
    # Location autocomplete API endpoint
    # NOTE(review): the endpoint prefix is an empty string in this copy of
    # the script; it must be filled in before the request can succeed.
    location_url = '' + location
    data = {
        'zipCode': '',
        'city': '',
        'state': '',
        'date': str(showdate),
        'page': 1,
        'favTheaterOnly': False,
        'limit': 30,
        'offset': 0,
        'isdesktop': True
    }

    # Retrieving available locations
    location_response = requests.get(location_url, cookies=cookie, headers=location_headers).json()
    locations = location_response.get('resultsByType', {}).get('locations', {}).get('items', [])
    if not locations:
        print("No location found")
        return movie_listings

    # Selecting first location from available locations
    searched_location = locations[0]
    searched_location_url = searched_location.get('link')
    location_name = searched_location.get('name') or ''
    state = searched_location.get('state')
    # Getting city from location name, city is necessary to get theater
    # lists if you are passing location as input
    city = location_name.split(',')[0].strip() if ',' in location_name else None
    if city and state:
        data['city'] = city
        data['state'] = state
    else:
        # city,state is not necessary if you are passing zipcode as input
        data['zipCode'] = location_name

    # Headers for getting theater listing for the searched location
    # NOTE(review): these headers are built but the request below sends
    # location_headers instead; confirm which set is intended before
    # switching (this one advertises 'br' encoding).
    theater_headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ml;q=0.8',
        'referer': searched_location_url,
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }
    # NOTE(review): the movie listing endpoint is an empty string in this
    # copy of the script; it must be filled in.
    movie_url = ''

    # Fetching Movie details for search location
    print("Fetching movie details")
    try:
        movie_response = requests.get(movie_url, params=data, headers=location_headers).json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers a response body that is not valid JSON.
        print("Failed to get movie details")
        return movie_listings

    all_theaters = movie_response.get('theaters')
    if not all_theaters:
        print("No theaters found")
        return movie_listings

    # Iterating through each theater
    for theater in all_theaters:
        theater_name = theater.get('name')
        # Skip missing address parts instead of failing on None + str.
        address_parts = (
            theater.get('address1'),
            theater.get('city'),
            theater.get('state'),
            theater.get('zip'),
        )
        theater_address = ' '.join(part for part in address_parts if part)

        all_movies = theater.get('movies')
        if not all_movies:
            print("No movies in %s" % (theater_name,))
            continue

        # Iterating through each movie in a theater
        for movie in all_movies:
            # cleaning data
            movie_name = (movie.get('title') or '').strip()
            duration = str(movie.get('runtime'))
            # Genre names are joined, re-split on whitespace and joined
            # with commas, so every whitespace-separated word becomes its
            # own comma-separated token.
            genre = ','.join(' '.join(movie.get('genres') or []).split())
            movie_rating = movie.get('rating')
            stars = movie.get('stars') or {}
            star_rating = str(stars.get('totalRating'))

            movie_listings.append({
                "Theatre_Name": theater_name,
                "Theatre_Address": theater_address,
                "Movie_Name": movie_name,
                "Show_Date": showdate,
                "Movie_Rating": movie_rating,
                "Star_Rating": star_rating,
                "Duration": duration,
                "Genre": genre,
                "Location_or_Zipcode": location
            })

    return movie_listings
if __name__ == "__main__":
    # Command-line entry point: validate the show date, scrape the listings
    # and write them to "<location>-<showdate>-movie-results.csv".
    # Example arguments: 20001 2017-12-31
    argparser = argparse.ArgumentParser()
    argparser.add_argument('location', help='theater location (zipcode or city+state)', type=str)
    argparser.add_argument('showdate', help='movie show time', type=str)
    args = argparser.parse_args()
    location = args.location
    showdate = args.showdate

    # Parse the date once; None marks an invalid value.
    try:
        searchdate = datetime.datetime.strptime(showdate, '%Y-%m-%d').date()
    except ValueError:
        searchdate = None
        print("Invalid showdate, showdate should be YYYY-MM-DD format")

    if searchdate is not None:
        if searchdate >= datetime.date.today():
            scraped_data = parse(location, showdate)
            if scraped_data:
                print("Writing data to output file")
                # The csv writer used here emits bytes, so the file is
                # opened in binary mode.
                with open('%s-%s-movie-results.csv' % (location, showdate), 'wb') as csvfile:
                    fieldnames = ['Theatre_Name', 'Theatre_Address', 'Movie_Name',
                                  'Show_Date', 'Location_or_Zipcode', 'Duration', 'Genre', 'Movie_Rating', 'Star_Rating']
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
                    writer.writeheader()
                    for data in scraped_data:
                        writer.writerow(data)
            else:
                print("Your search for %s, in %s does not match any movies" % (location, showdate))
        else:
            print("Entered date is already passed")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment