Last active
August 24, 2020 04:32
-
-
Save scrapehero/bc34513e2ea72dc0890ad47fbd8a1a4f to your computer and use it in GitHub Desktop.
Python 3 code to extract the flight schedules and prices for a source and destination pair.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import requests | |
from lxml import html | |
from collections import OrderedDict | |
import argparse | |
def _format_price(raw_price):
    """Return ``(sort_key, display)`` for a leg's raw price value.

    ``display`` is the price with two decimals. When the value is missing or
    not numeric, ``display`` is an empty string and ``sort_key`` is
    ``float('inf')`` so that such legs sort after every priced leg.
    """
    try:
        amount = float(raw_price)
    except (TypeError, ValueError):
        return float('inf'), ''
    return amount, "{0:.2f}".format(amount)


def _build_flight_record(leg):
    """Flatten one entry of the page's ``legs`` mapping.

    Returns ``(price_sort_key, record)`` where ``record`` is the dict that is
    written to the output file.
    """
    departure_location = leg.get('departureLocation', {})
    arrival_location = leg.get('arrivalLocation', {})
    duration = leg.get('duration', {})
    timeline = leg.get('timeline', [])
    # Plane details come from the first timeline entry; a leg without any
    # timeline simply gets empty plane fields instead of raising IndexError.
    carrier = timeline[0].get('carrier', {}) if timeline else {}

    no_of_stops = leg.get("stops", "")
    if no_of_stops == 0:
        stop = "Nonstop"
    else:
        stop = str(no_of_stops) + ' Stop'

    # Fall back to the operating carrier when the summary has no airline name.
    airline_name = (leg.get('carrierSummary', {}).get('airlineName', '')
                    or carrier.get('operatedBy', ''))

    price_key, formatted_price = _format_price(
        leg.get('price', {}).get('totalPriceAsDecimal', ''))

    # Only timeline entries that carry a departure airport are reported.
    timings = [
        {
            'departure_airport': segment['departureAirport'].get('longName', ''),
            'departure_time': segment.get('departureTime', {}).get('time', ''),
            'arrival_airport': segment.get('arrivalAirport', {}).get('longName', ''),
            'arrival_time': segment.get('arrivalTime', {}).get('time', ''),
        }
        for segment in timeline
        if 'departureAirport' in segment
    ]

    record = {
        'stops': stop,
        'ticket price': formatted_price,
        'departure': (departure_location.get('airportLongName', '') + ", "
                      + departure_location.get('airportCity', '')),
        'arrival': (arrival_location.get('airportLongName', '') + ", "
                    + arrival_location.get('airportCity', '')),
        'flight duration': "{0} days {1} hours {2} minutes".format(
            duration.get('numOfDays', ''),
            duration.get('hours', ''),
            duration.get('minutes', '')),
        'airline': airline_name,
        'plane': carrier.get('plane', ''),
        'timings': timings,
        'plane code': carrier.get('planeCode', ''),
    }
    return price_key, record


def parse(source, destination, date):
    """Fetch one-way economy flight results from Expedia for a route and date.

    Args:
        source: Source airport code, inserted into the search URL.
        destination: Destination airport code, inserted into the search URL.
        date: Departure date, inserted into the search URL (the command-line
            help of this script documents it as MM/DD/YYYY).

    Returns:
        A list of flight dicts sorted by ascending numeric ticket price, with
        legs that have no usable price placed last. If the embedded results
        JSON cannot be read after five attempts, returns
        ``{"error": "failed to process the page"}`` instead.
    """
    # The URL and headers do not change between attempts, so build them once.
    url = (
        "https://www.expedia.com/Flights-Search?trip=oneway"
        "&leg1=from:{0},to:{1},departure:{2}TANYT"
        "&passengers=adults:1,children:0,seniors:0,infantinlap:Y"
        "&options=cabinclass%3Aeconomy&mode=search&origref=www.expedia.com"
    ).format(source, destination, date)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}

    for _attempt in range(5):
        try:
            # Certificate verification is left at the library default
            # (enabled), and a timeout keeps a stalled connection from
            # hanging the script forever.
            response = requests.get(url, headers=headers, timeout=30)
            parser = html.fromstring(response.text)
            json_data_xpath = parser.xpath("//script[@id='cachedResultsJson']//text()")
            # An absent script tag yields '' here, which fails to decode and
            # triggers a retry.
            raw_json = json.loads(json_data_xpath[0] if json_data_xpath else '')
            flight_data = json.loads(raw_json["content"])
            legs = flight_data['legs']
        except (ValueError, KeyError):
            print("Retrying...")
            continue

        # Record building happens outside the try block so that a problem in
        # a single leg is not mistaken for a page that needs re-fetching.
        priced_records = [_build_flight_record(leg) for leg in legs.values()]
        # Sort on the numeric price; the formatted strings would compare
        # lexicographically ("1000.00" before "203.00").
        priced_records.sort(key=lambda pair: pair[0])
        return [record for _price_key, record in priced_records]

    return {"error": "failed to process the page",}
if __name__ == "__main__":
    # Command-line entry point: read the route and date, scrape the results,
    # and write them as indented JSON to "<source>-<destination>-flight-results.json".
    cli = argparse.ArgumentParser()
    for arg_name, arg_help in (
        ('source', 'Source airport code'),
        ('destination', 'Destination airport code'),
        ('date', 'MM/DD/YYYY'),
    ):
        cli.add_argument(arg_name, help=arg_help)
    args = cli.parse_args()
    source, destination, date = args.source, args.destination, args.date

    print("Fetching flight details")
    scraped_data = parse(source, destination, date)

    print("Writing data to output file")
    output_name = '%s-%s-flight-results.json' % (source, destination)
    with open(output_name, 'w') as out_file:
        json.dump(scraped_data, out_file, indent=4)
Hi,
The code works well, but it doesn't scrape data for all the flights that the website shows. For instance, if I run the following command:
python expedia.py nyc mia 11/08/2019
the last result the scraper returns is a Delta flight (priced at $203 at the time of writing). The script generates the following URL based on the inputs:
As you can see, the website has twice the number of flights returned by the script. Am I missing something here? I would like to get data for all flights on a particular route.
Thanks for your help.
I made a go version of this here https://github.com/4d55397500/hack-flight-search
This does not work anymore; the initial search URL somehow returns no JSON data.
Same here too. No JSON data is returned. Is there any workaround?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@pisaller - make sure that you're comparing "apples to apples", i.e. the website by default is using the following values while performing the search:
while the script is using: