Skip to content

Instantly share code, notes, and snippets.

@jamieaitken
Created September 19, 2018 14:20
Show Gist options
  • Save jamieaitken/e6df15db9f1204fe67c16f1404996c6f to your computer and use it in GitHub Desktop.
Save jamieaitken/e6df15db9f1204fe67c16f1404996c6f to your computer and use it in GitHub Desktop.
def scrape(url, timeout=30):
    """Download a TripAdvisor listing page and extract its review summary.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        If the server answers with an HTTP error status (>= 400), the dict
        ``{'status': <status code>, 'amountOfReviews': 0}``.

        Otherwise a dict with the keys:

        * ``status`` -- always ``200``.
        * ``rating`` -- overall rating as a float, or ``None`` when the
          page has no overall-rating text.
        * ``reviews`` -- dict keyed by position on the page (0, 1, ...);
          each value may hold ``authorName``, ``review``, ``authorUrl``
          and ``rating`` (1-5).
        * ``aggregatedReviews`` -- estimated number of reviews per star
          bucket (``fiveStar`` ... ``oneStar``), computed from the
          percentage shown in the ratings chart.
        * ``amountOfReviews`` -- total review count, ``0`` when the page
          does not show one.

    Raises:
        requests.exceptions.RequestException: the request failed or
            timed out.
        ValueError: a count, percentage or rating on the page is not
            numeric.
    """
    urllib3.disable_warnings()

    # HTTP header names are tokens and must not contain spaces; with the
    # spaces in place the intended headers were never actually sent.
    headers = {
        "Accept": "text/javascript, text/html, application/xml, text/xml, */*",
        # "br" is intentionally not advertised: Brotli bodies are only
        # decoded when an optional brotli package is installed, and an
        # undecoded body cannot be parsed below.
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Cache-Control": "no-cache",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        # NOTE(review): Host is fixed to the .com site while review links
        # below are built for .co.uk -- confirm which domain callers pass.
        "Host": "www.tripadvisor.com",
        "Pragma": "no-cache",
        "Referer": "http://www.tripadvisor.com",
        "User-Agent": (
            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) "
            "Gecko/20100101 Firefox/28.0"
        ),
        "X-Requested-With": "XMLHttpRequest",
    }

    # Without a timeout a stalled connection would block forever.
    page_response = requests.get(
        url=url, headers=headers, verify=True, timeout=timeout
    )

    # Error pages carry none of the expected markup, so report the status
    # instead of trying to parse them.
    if page_response.status_code >= 400:
        return {
            'status': page_response.status_code,
            'amountOfReviews': 0
        }

    parser = html.fromstring(page_response.text)

    chart_texts = parser.xpath('.//ul[@class="ratings_chart"]//text()')
    raw_rating = parser.xpath('.//span[@class="overallRating"]//text()')
    raw_amount_reviews = parser.xpath('.//span[@property="count"]//text()')
    author_names = parser.xpath(
        './/span[@class="expand_inline scrname"]//text()'
    )
    review_texts = parser.xpath('//p[@class="partial_entry"]/text()')
    rating_classes = parser.xpath(
        '//div[contains(@class, "prw_reviews_basic_review_hsx")]'
        '//span[contains(@class, "ui_bubble_rating")]/@class'
    )
    review_links = parser.xpath('//div[contains(@class, "quote")]/a/@href')

    # Chart label -> key used in the aggregated result.
    keys = {
        'Excellent': "fiveStar",
        'Very good': "fourStar",
        'Average': "threeStar",
        'Poor': "twoStar",
        'Terrible': "oneStar"
    }
    # CSS class suffix of a review's bubble icon -> star rating.
    bubble_to_stars = {
        'bubble_10': 1,
        'bubble_20': 2,
        'bubble_30': 3,
        'bubble_40': 4,
        'bubble_50': 5
    }

    # A missing count element is treated as zero reviews; thousands
    # separators ("1,234") are removed before conversion.
    amount_reviews = 0
    if raw_amount_reviews:
        amount_reviews = int(raw_amount_reviews[0].replace(",", "").strip())

    # The per-review lists are matched up purely by position. setdefault
    # keeps a longer list from raising KeyError when there are fewer
    # author names than reviews, links or ratings.
    reviews = {}
    for index, name in enumerate(author_names):
        reviews[index] = {
            'authorName': name
        }
    for index, text in enumerate(review_texts):
        reviews.setdefault(index, {})['review'] = text
    for index, href in enumerate(review_links):
        reviews.setdefault(index, {})['authorUrl'] = (
            "https://www.tripadvisor.co.uk/" + href
        )
    for index, css_class in enumerate(rating_classes):
        bubble = css_class.replace('ui_bubble_rating ', '').strip()
        stars = bubble_to_stars.get(bubble)
        if stars is not None:
            reviews.setdefault(index, {})['rating'] = stars

    # In the chart text each known label is followed by its percentage;
    # turn that percentage into an estimated review count.
    aggregatedReviews = {}
    for position, label in enumerate(chart_texts):
        if label not in keys:
            continue
        if position + 1 >= len(chart_texts):
            # Label is the last text node: no percentage to read.
            continue
        percent = float(chart_texts[position + 1].strip().strip("%"))
        aggregatedReviews[keys[label]] = round(
            amount_reviews * (percent / 100), 0)

    rating = None
    if raw_rating:
        rating = float(
            "".join(raw_rating).replace("of 5 bubbles", "").strip()
        )

    return {
        "status": 200,
        "rating": rating,
        "reviews": reviews,
        "aggregatedReviews": aggregatedReviews,
        "amountOfReviews": amount_reviews
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment