-
-
Save scrapehero/bc34513e2ea72dc0890ad47fbd8a1a4f to your computer and use it in GitHub Desktop.
import json | |
import requests | |
from lxml import html | |
from collections import OrderedDict | |
import argparse | |
def parse(source,destination,date): | |
for i in range(5): | |
try: | |
url = "https://www.expedia.com/Flights-Search?trip=oneway&leg1=from:{0},to:{1},departure:{2}TANYT&passengers=adults:1,children:0,seniors:0,infantinlap:Y&options=cabinclass%3Aeconomy&mode=search&origref=www.expedia.com".format(source,destination,date) | |
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'} | |
response = requests.get(url, headers=headers, verify=False) | |
parser = html.fromstring(response.text) | |
json_data_xpath = parser.xpath("//script[@id='cachedResultsJson']//text()") | |
raw_json =json.loads(json_data_xpath[0] if json_data_xpath else '') | |
flight_data = json.loads(raw_json["content"]) | |
flight_info = OrderedDict() | |
lists=[] | |
for i in flight_data['legs'].keys(): | |
total_distance = flight_data['legs'][i].get("formattedDistance",'') | |
exact_price = flight_data['legs'][i].get('price',{}).get('totalPriceAsDecimal','') | |
departure_location_airport = flight_data['legs'][i].get('departureLocation',{}).get('airportLongName','') | |
departure_location_city = flight_data['legs'][i].get('departureLocation',{}).get('airportCity','') | |
departure_location_airport_code = flight_data['legs'][i].get('departureLocation',{}).get('airportCode','') | |
arrival_location_airport = flight_data['legs'][i].get('arrivalLocation',{}).get('airportLongName','') | |
arrival_location_airport_code = flight_data['legs'][i].get('arrivalLocation',{}).get('airportCode','') | |
arrival_location_city = flight_data['legs'][i].get('arrivalLocation',{}).get('airportCity','') | |
airline_name = flight_data['legs'][i].get('carrierSummary',{}).get('airlineName','') | |
no_of_stops = flight_data['legs'][i].get("stops","") | |
flight_duration = flight_data['legs'][i].get('duration',{}) | |
flight_hour = flight_duration.get('hours','') | |
flight_minutes = flight_duration.get('minutes','') | |
flight_days = flight_duration.get('numOfDays','') | |
if no_of_stops==0: | |
stop = "Nonstop" | |
else: | |
stop = str(no_of_stops)+' Stop' | |
total_flight_duration = "{0} days {1} hours {2} minutes".format(flight_days,flight_hour,flight_minutes) | |
departure = departure_location_airport+", "+departure_location_city | |
arrival = arrival_location_airport+", "+arrival_location_city | |
carrier = flight_data['legs'][i].get('timeline',[])[0].get('carrier',{}) | |
plane = carrier.get('plane','') | |
plane_code = carrier.get('planeCode','') | |
formatted_price = "{0:.2f}".format(exact_price) | |
if not airline_name: | |
airline_name = carrier.get('operatedBy','') | |
timings = [] | |
for timeline in flight_data['legs'][i].get('timeline',{}): | |
if 'departureAirport' in timeline.keys(): | |
departure_airport = timeline['departureAirport'].get('longName','') | |
departure_time = timeline['departureTime'].get('time','') | |
arrival_airport = timeline.get('arrivalAirport',{}).get('longName','') | |
arrival_time = timeline.get('arrivalTime',{}).get('time','') | |
flight_timing = { | |
'departure_airport':departure_airport, | |
'departure_time':departure_time, | |
'arrival_airport':arrival_airport, | |
'arrival_time':arrival_time | |
} | |
timings.append(flight_timing) | |
flight_info={'stops':stop, | |
'ticket price':formatted_price, | |
'departure':departure, | |
'arrival':arrival, | |
'flight duration':total_flight_duration, | |
'airline':airline_name, | |
'plane':plane, | |
'timings':timings, | |
'plane code':plane_code | |
} | |
lists.append(flight_info) | |
sortedlist = sorted(lists, key=lambda k: k['ticket price'],reverse=False) | |
return sortedlist | |
except ValueError: | |
print ("Rerying...") | |
return {"error":"failed to process the page",} | |
if __name__=="__main__":
    # Command-line entry point: scrape one-way flights for a route/date
    # and dump the results to a JSON file named after the route.
    cli = argparse.ArgumentParser()
    cli.add_argument('source', help='Source airport code')
    cli.add_argument('destination', help='Destination airport code')
    cli.add_argument('date', help='MM/DD/YYYY')
    options = cli.parse_args()

    print ("Fetching flight details")
    results = parse(options.source, options.destination, options.date)
    print ("Writing data to output file")
    output_name = '%s-%s-flight-results.json' % (options.source, options.destination)
    with open(output_name, 'w') as out_file:
        json.dump(results, out_file, indent = 4)
Hi, thank you for your code. I just found that the flights scraped from the page are different from the actual website — for example, PVG-LAX on 05/01/2019. There are 7 non-stop flights on the web, but I could only get the first 2. Do you have any idea how this happened?
@pisaller - make sure that you're comparing "apples to apples", i.e. the website by default is using following values while performing search:
passengers=adults:1,children:0,seniors:0,infantinlap:Y
options=cabinclass:economy,nopenalty:N
while the script is using:
passengers=adults:1,children:0,seniors:0,infantinlap:Y
options=cabinclass:economy
Hi,
The code works well, but doesn't scrape data for all the flights that the website shows. For instance, if I run the following command:
python expedia.py nyc mia 11/08/2019
the last result the scraper returns is a Delta flight (worth $203 at the time of writing). The script generates the following URL based on the inputs:
As you can see, the website has twice the number of flights returned by the script. Am I missing something here? I would like to get data for all flights on a particular route.
Thanks for your help.
I made a go version of this here https://github.com/4d55397500/hack-flight-search
does not work anymore; the initial search URL returns no JSON data somehow
Same here too. No json data returned. Is there any workaround?
Hi,
I am getting these errors.. is it because I am running Python 3.6.3?