Skip to content

Instantly share code, notes, and snippets.

@scrapehero
Last active September 11, 2019 16:46
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save scrapehero/1b26ad7fd8db1023defa1f4afd49bdbb to your computer and use it in GitHub Desktop.
Save scrapehero/1b26ad7fd8db1023defa1f4afd49bdbb to your computer and use it in GitHub Desktop.
Python 3 code to extract details of hotels from TripAdvisor
# -*- coding: utf-8 -*-
from lxml import html
import requests
from collections import OrderedDict
import json
import argparse
import re
import sys
# Upper bound on how many times the same URL is re-fetched when the page
# cannot be parsed. Adjust MAX_RETRY according to the blocking from tripadvisor.
MAX_RETRY = 10
# Module-wide count of retries performed so far; process_page increments it
# (via `global RETRY`) each time it decides to re-fetch the URL.
RETRY = 0
def clean(text):
    """Collapse all whitespace in *text* into single spaces.

    ``text`` may be a string or an iterable of string fragments; fragments
    are concatenated without a separator first. Runs of spaces, ``\\n``,
    ``\\r`` and ``\\t`` become one space and leading/trailing whitespace is
    dropped.

    Returns:
        The normalised string, or ``None`` when ``text`` is falsy
        (``None``, ``""`` or an empty list).
    """
    if not text:
        return None
    # Glue the fragments together, then let str.split() discard every kind
    # of whitespace before re-joining the words with single spaces.
    joined = ''.join(text)
    words = joined.split()
    return ' '.join(words)
def process_request(url, retry=0, timeout=30):
    """Fetch a TripAdvisor hotel page and hand it to ``process_page``.

    Args:
        url: URL of the TripAdvisor hotel page.
        retry: Number of the current attempt; only used for the progress
            message printed before the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The dict produced by ``process_page``, or
        ``{'error': 'Page not found', 'status_code': 404}`` when the server
        answers with HTTP 404.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out.
    """
    print("Fetching %s, retry count %s" % (url, retry))
    # Browser-like headers.
    # NOTE(review): "br" is advertised in accept-encoding; requests can only
    # decode Brotli responses when a brotli package is installed - confirm
    # the runtime environment provides one.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 404:
        return {'error': 'Page not found', 'status_code': 404}
    # The URL is passed as the document's base URL.
    parser = html.fromstring(response.text, url)
    return process_page(parser, url)
def _parse_structured_data(raw_address_json):
    """Return ``(address, rating, review_count)`` from the ld+json script text."""
    defaults = ({}, 0, 0)
    if not raw_address_json:
        return defaults
    try:
        parsed = json.loads(raw_address_json[0])
        rating = parsed.get('aggregateRating') or {}
        raw_address = parsed.get('address') or {}
        country = raw_address.get('addressCountry') or {}
        address = {
            'street_address': raw_address.get('streetAddress'),
            'region': raw_address.get('addressRegion'),
            'locality': raw_address.get('addressLocality'),
            # addressCountry is normally an object with a "name"; accept a
            # plain string as well instead of failing on it.
            'country': country.get('name') if isinstance(country, dict) else country,
            'zipcode': raw_address.get('postalCode'),
        }
        return address, rating.get('ratingValue'), rating.get('reviewCount')
    except (ValueError, AttributeError, TypeError) as exc:
        # Malformed or unexpectedly shaped structured data must not discard
        # the rest of the scraped fields; fall back to empty defaults.
        print("Could not parse structured address data: %s" % exc)
        return defaults


def _parse_rating_histogram(json_loaded):
    """Return the review-count-per-grade dict from the page-manifest JSON."""
    ratings = {}
    responses = ((json_loaded.get('redux') or {}).get('api') or {}).get('responses') or {}
    for api_url, payload in responses.items():
        if '/data/1.0/location/' not in api_url:
            continue
        histogram = ((payload or {}).get('data') or {}).get('rating_histogram') or {}
        ratings = {
            'Excellent': int(histogram.get('count_5') or 0),
            'Good': int(histogram.get('count_4') or 0),
            'Average': int(histogram.get('count_3') or 0),
            'Poor': int(histogram.get('count_2') or 0),
            'Terrible': int(histogram.get('count_1') or 0),
        }
    return ratings


def _parse_additional_info(raw_additional_info):
    """Map each "sub_title" node's text to the text of its first "sub_content" sibling."""
    additional_info = OrderedDict()
    for node in raw_additional_info:
        key_parts = node.xpath(".//text()")
        value_parts = node.xpath('.//following-sibling::div[@class="sub_content"][1]//text()')
        if not key_parts or not value_parts:
            continue
        key = key_parts[0]
        # contact information such as website and email address are encoded
        # by tripadvisor, so that section is skipped
        if key == 'Contact Information':
            continue
        additional_info[key] = clean(value_parts)
    return additional_info


def process_page(parser, url):
    """Extract hotel details from a parsed TripAdvisor hotel page.

    If the embedded page-manifest JSON or the hotel name cannot be found,
    the URL is fetched again through ``process_request`` (at most
    ``MAX_RETRY`` times overall, tracked in the module-level ``RETRY``
    counter) and the result of that new attempt is returned. Once the retry
    budget is used up, whatever can be extracted from the current page is
    returned.

    Args:
        parser: Parsed HTML tree exposing ``.xpath()`` (as built by
            ``lxml.html.fromstring`` in ``process_request``).
        url: URL the page was fetched from; used for retries and stored in
            the result as ``hotel_url``.

    Returns:
        dict with the keys ``address``, ``ratings``, ``amenities``,
        ``official_description``, ``additional_info``, ``rating``,
        ``review_count``, ``name``, ``rank``, ``highlights`` and
        ``hotel_url`` - or whatever the retried ``process_request`` returns.
    """
    global RETRY

    script_text = ' '.join(''.join(parser.xpath('//script//text()')).split())
    # Raw string: the pattern is full of regex escapes that are invalid
    # string escapes in a normal literal.
    raw_json = re.findall(r"define\(\'@ta\/page\-manifest\'\,\[\]\,function\(\)\{return\s+({.*?});\}\);", script_text)
    try:
        json_loaded = json.loads(raw_json[0])
    except (IndexError, ValueError):
        # No manifest found (IndexError) or it is not valid JSON (ValueError).
        json_loaded = {}
        if RETRY < MAX_RETRY:
            RETRY += 1
            # Retrying the same URL and using the result of that attempt
            return process_request(url, RETRY)

    XPATH_NAME = '//h1[@id="HEADING"]//text()'
    XPATH_RANK = '//span[contains(@class,"popularity")]//text()'
    XPATH_AMENITIES = "//div[contains(text(),'HOTEL FEATURES')]/following-sibling::div//div[@class='textitem']//text()"
    XPATH_HIGHLIGHTS = '//div[contains(@class,"HighlightedAmenities__amenityItem")]/text()'
    XPATH_OFFICIAL_DESCRIPTION = '//div[contains(text(),"Description")]/following-sibling::div//span[contains(@class,"introText")]/text()'
    XPATH_ADDITIONAL_INFO = "//div[@class='section_content']//div[@class='sub_title']"
    XPATH_FULL_ADDRESS_JSON = '//script[@type="application/ld+json"]//text()'

    name = clean(parser.xpath(XPATH_NAME))
    if not name and RETRY < MAX_RETRY:
        RETRY += 1
        # Retrying the same URL and using the result of that attempt
        return process_request(url, RETRY)

    rank = clean(parser.xpath(XPATH_RANK))
    amenities = parser.xpath(XPATH_AMENITIES)
    raw_highlights = parser.xpath(XPATH_HIGHLIGHTS)
    official_description = clean(parser.xpath(XPATH_OFFICIAL_DESCRIPTION))

    # Drop newline-only text nodes, then strip newlines left inside items.
    highlights = ','.join(item for item in raw_highlights if item != '\n').replace('\n', '')

    address, hotel_rating, review_count = _parse_structured_data(
        parser.xpath(XPATH_FULL_ADDRESS_JSON))
    ratings = _parse_rating_histogram(json_loaded) if json_loaded else {}
    additional_info_dict = _parse_additional_info(parser.xpath(XPATH_ADDITIONAL_INFO))

    return {
        'address': address,
        'ratings': ratings,
        'amenities': {'Hotel Amenities': ','.join(amenities)},
        'official_description': official_description,
        'additional_info': additional_info_dict,
        'rating': float(hotel_rating) if hotel_rating else 0.0,
        'review_count': int(review_count) if review_count else 0,
        'name': name,
        'rank': rank,
        'highlights': highlights,
        'hotel_url': url
    }
if __name__ == '__main__':
    # Command-line entry point: scrape the hotel URL given as the single
    # positional argument and dump the result to a JSON file.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url', help='Tripadvisor hotel url')
    args = arg_parser.parse_args()
    scraped_data = process_request(args.url)
    if scraped_data:
        print("Writing scraped data")
        # ensure_ascii=False writes non-ASCII characters verbatim, so the
        # file is opened as UTF-8 explicitly instead of relying on the
        # platform's default encoding.
        with open('tripadvisor_hotel_scraped_data.json', 'w', encoding='utf-8') as f:
            json.dump(scraped_data, f, indent=4, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment