Last active
September 11, 2019 16:46
-
-
Save scrapehero/1b26ad7fd8db1023defa1f4afd49bdbb to your computer and use it in GitHub Desktop.
Python 3 code to extract details of hotels from TripAdvisor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from lxml import html | |
import requests | |
from collections import OrderedDict | |
import json | |
import argparse | |
import re | |
import sys | |
# Adjust MAX_RETRY according to the blocking from tripadvisor
# MAX_RETRY is the upper bound that process_page compares RETRY against
# before it re-requests a URL.
MAX_RETRY = 10
# RETRY is a module-level counter; process_page declares it `global` and
# increments it each time it re-requests the page.
RETRY = 0
def clean(text):
    """
    Function to collapse a group of text fragments into one tidy string
    Args:
        : required param text : string or iterable of strings (e.g. an xpath text() result)
    return :
        : the fragments concatenated, with every run of whitespace
          (\n, \r, \t, spaces) reduced to a single space and no leading
          or trailing whitespace; None when text is empty or None
    """
    if not text:
        return None
    # Fragments are glued together first, so whitespace is only normalised
    # once over the combined string.
    combined = ''.join(text)
    words = combined.split()
    return ' '.join(words)
def process_request(url, retry=0, timeout=30):
    """
    Function to fetch a tripadvisor Hotel page and extract its details
    Args:
        : required param url : url of tripadvisor
        : optional param retry : retry count, only used in the progress message
        : optional param timeout : seconds handed to requests.get as its timeout
    return :
        : whatever process_page returns for the fetched page, or the dict
          {'error': 'Page not found', 'status_code': 404} on a 404 response
    """
    print("Fetching %s, retry count %s" % (url, retry))
    # NOTE(review): "br" is advertised in accept-encoding; confirm the
    # environment can decode brotli responses, otherwise consider dropping it.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36",
    }
    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 404:
        return {'error': 'Page not found', 'status_code': 404}
    # The url is passed as the second argument of html.fromstring (base url).
    parser = html.fromstring(response.text, url)
    return process_page(parser, url)
def process_page(parser, url):
    """
    Function to extract hotel details from a parsed tripadvisor Hotel page
    Args:
        : required param parser : parsed page exposing .xpath()
        : required param url : url of the page; used to re-request it and
          echoed back under 'hotel_url'
    return :
        : dict with the keys address, ratings, amenities,
          official_description, additional_info, rating, review_count,
          name, rank, highlights and hotel_url; when the page is
          re-requested, the result of process_request is returned instead
    Notes:
        : the module-level RETRY counter is incremented on every re-request
          and is never reset here
        : errors raised while parsing the ld+json block propagate to the caller
    """
    global RETRY

    script_text = ' '.join(''.join(parser.xpath('//script//text()')).split())
    # Raw string: the pattern contains backslash escapes that are not valid
    # string escapes.
    raw_json = re.findall(
        r"define\(\'@ta\/page\-manifest\'\,\[\]\,function\(\)\{return\s+({.*?});\}\);",
        script_text)
    try:
        json_loaded = json.loads(raw_json[0])
    except (IndexError, ValueError):
        # No manifest found (IndexError) or it is not valid JSON (ValueError)
        json_loaded = {}
        if RETRY < MAX_RETRY:
            RETRY += 1
            # Retrying the same URL and handing back the retried result
            return process_request(url, RETRY)

    XPATH_NAME = '//h1[@id="HEADING"]//text()'
    XPATH_RANK = '//span[contains(@class,"popularity")]//text()'
    XPATH_AMENITIES = "//div[contains(text(),'HOTEL FEATURES')]/following-sibling::div//div[@class='textitem']//text()"
    XPATH_HIGHLIGHTS = '//div[contains(@class,"HighlightedAmenities__amenityItem")]/text()'
    XPATH_OFFICIAL_DESCRIPTION = '//div[contains(text(),"Description")]/following-sibling::div//span[contains(@class,"introText")]/text()'
    XPATH_ADDITIONAL_INFO = "//div[@class='section_content']//div[@class='sub_title']"
    XPATH_FULL_ADDRESS_JSON = '//script[@type="application/ld+json"]//text()'
    XPATH_INFO_KEY = ".//text()"
    XPATH_INFO_VALUE = './/following-sibling::div[@class="sub_content"][1]//text()'

    name = clean(parser.xpath(XPATH_NAME))
    if not name and RETRY < MAX_RETRY:
        RETRY += 1
        # Retrying the same URL and handing back the retried result
        return process_request(url, RETRY)

    rank = clean(parser.xpath(XPATH_RANK))
    amenities = parser.xpath(XPATH_AMENITIES)
    raw_highlights = parser.xpath(XPATH_HIGHLIGHTS)
    official_description = clean(parser.xpath(XPATH_OFFICIAL_DESCRIPTION))
    raw_additional_info = parser.xpath(XPATH_ADDITIONAL_INFO)
    raw_address_json = parser.xpath(XPATH_FULL_ADDRESS_JSON)

    # Defaults used when the page carries no ld+json block; review_count must
    # be bound here because it is read unconditionally when building `data`.
    hotel_rating = 0
    review_count = 0
    address = {}
    if raw_address_json:
        parsed_address_info = json.loads(raw_address_json[0])
        rating = parsed_address_info.get('aggregateRating', {})
        raw_address = parsed_address_info.get("address", {})
        hotel_rating = rating.get('ratingValue')
        review_count = rating.get('reviewCount')
        address = {
            'street_address': raw_address.get('streetAddress'),
            'region': raw_address.get('addressRegion'),
            'locality': raw_address.get('addressLocality'),
            'country': raw_address.get("addressCountry", {}).get("name"),
            'zipcode': raw_address.get("postalCode"),
        }

    highlights = ','.join(
        item for item in raw_highlights if item != '\n').replace('\n', '')

    # Walk redux -> api -> responses; a missing level leaves ratings empty,
    # the same result as when the manifest is absent.
    redux_response = json_loaded
    for key in ('redux', 'api', 'responses'):
        redux_response = redux_response.get(key) or {}

    ratings = {}
    for url_, api_response in redux_response.items():
        if '/data/1.0/location/' not in url_:
            continue
        rating_histogram = (api_response.get('data') or {}).get('rating_histogram') or {}
        # When several keys match, the last one wins.
        ratings = {
            'Excellent': int(rating_histogram.get('count_5', 0)),
            'Good': int(rating_histogram.get('count_4', 0)),
            'Average': int(rating_histogram.get('count_3', 0)),
            'Poor': int(rating_histogram.get('count_2', 0)),
            'Terrible': int(rating_histogram.get('count_1', 0)),
        }

    amenity_dict = {'Hotel Amenities': ','.join(amenities)}

    additional_info_dict = OrderedDict()
    for info in raw_additional_info:
        raw_info_key = info.xpath(XPATH_INFO_KEY)
        raw_info_value = info.xpath(XPATH_INFO_VALUE)
        if not (raw_info_key and raw_info_value):
            continue
        # contact information such as website and email address are encoded by tripadvisor
        if raw_info_key[0] != 'Contact Information':
            additional_info_dict[raw_info_key[0]] = clean(raw_info_value)

    data = {
        'address': address,
        'ratings': ratings,
        'amenities': amenity_dict,
        'official_description': official_description,
        'additional_info': additional_info_dict,
        'rating': float(hotel_rating) if hotel_rating else 0.0,
        'review_count': int(review_count) if review_count else 0,
        'name': name,
        'rank': rank,
        'highlights': highlights,
        'hotel_url': url
    }
    return data
if __name__ == '__main__':
    # Command-line entry point: takes one positional tripadvisor hotel url,
    # scrapes it and writes the result to tripadvisor_hotel_scraped_data.json
    # in the current working directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('url', help='Tripadvisor hotel url')
    args = parser.parse_args()
    url = args.url
    scraped_data = process_request(url)
    if scraped_data:
        print("Writing scraped data")
        # ensure_ascii=False writes non-ASCII characters verbatim, so the
        # file encoding is pinned to UTF-8 rather than the platform default.
        with open('tripadvisor_hotel_scraped_data.json', 'w', encoding='utf-8') as f:
            json.dump(scraped_data, f, indent=4, ensure_ascii=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment