@Dakkaron, forked from scrapehero/amazon_reviews.py
Python 3 code to extract Amazon reviews (a usage sketch follows the script)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Written as part of https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/
import concurrent.futures
import json
import re

import requests
import urllib3
from dateutil import parser as dateparser
from lxml import html

# The requests below use verify=False, so silence the resulting
# certificate warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Amazon top-level domain; must have an entry in LOCALE below ('com' or 'de')
AMAZON_VERSION = 'com'
# Add your own ASIN here
ASIN = 'B010S9M3L6'
# Badge strings Amazon uses to mark Vine and verified-purchase reviews,
# per marketplace
LOCALE = {
    'de': {
        'vine': 'Vine Kundenrezension eines kostenfreien Produkts',
        'verified': 'Verifizierter Kauf'
    },
    'com': {
        'vine': 'Vine Customer Review of Free Product',
        'verified': 'Verified Purchase'
    }
}
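
# To scrape another marketplace, add its badge strings to LOCALE and point
# AMAZON_VERSION at the new key. A sketch for amazon.co.uk, assuming its
# badges use the same English wording as amazon.com (verify against a live
# review page before relying on it):
#
# LOCALE['co.uk'] = {
#     'vine': 'Vine Customer Review of Free Product',
#     'verified': 'Verified Purchase'
# }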
def ParseReviewPage(asin, pagenum, pagecount):
    print("Parsing page " + str(pagenum) + "/" + str(pagecount))
    amazon_url = 'http://www.amazon.' + AMAZON_VERSION + '/dp/product-reviews/' + asin + '/?pageNumber=' + str(pagenum)
    # Send a recent user agent to prevent Amazon from blocking the request.
    # Find Chrome user agent strings here: https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # Amazon intermittently serves pages without review markup, so retry up to three times.
    for i in range(3):
        page = requests.get(amazon_url, headers=headers, verify=False)
        parser = html.fromstring(page.text)
        # The review container differs between page layouts; try both selectors.
        XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
        XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
        reviews = parser.xpath(XPATH_REVIEW_SECTION_1)
        if not reviews:
            reviews = parser.xpath(XPATH_REVIEW_SECTION_2)
        if reviews:
            break
    if not reviews:
        raise ValueError('unable to find reviews in page')
    reviews_list = []
    # Parse each individual review
    for review in reviews:
        XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
        XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
        XPATH_REVIEW_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
        XPATH_REVIEW_TEXT = './/span[@data-hook="review-body"]//text()'
        XPATH_REVIEW_COMMENTS = './/span[contains(@class,"review-comment-total")]//text()'
        XPATH_AUTHOR = './/a[@data-hook="review-author"]//text()'
        XPATH_BUYER = './/div[contains(@class,"review-format-strip")]//span//text()'

        raw_review_author = review.xpath(XPATH_AUTHOR)
        raw_review_rating = review.xpath(XPATH_RATING)
        raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
        raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
        raw_review_text = review.xpath(XPATH_REVIEW_TEXT)
        raw_buyer = review.xpath(XPATH_BUYER)

        # Clean the extracted data
        author = ' '.join(' '.join(raw_review_author).split())
        # The rating text reads e.g. "4.0 out of 5 stars"; its first character is the star count.
        review_rating = int(''.join(raw_review_rating)[0])
        review_header = ' '.join(' '.join(raw_review_header).split())
        isVerified = any(x == LOCALE[AMAZON_VERSION]['verified'] for x in raw_buyer)
        isVine = any(x == LOCALE[AMAZON_VERSION]['vine'] for x in raw_buyer)
        buyer = "vine" if isVine else ("verified" if isVerified else "unverified")
        try:
            review_posted_date = dateparser.parse(''.join(raw_review_posted_date)).strftime('%d %b %Y')
        except (ValueError, OverflowError):
            review_posted_date = None
        review_text = ' '.join(' '.join(raw_review_text).split())
        raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)
        # Strip the label text, keeping only the comment count (0 if no digits remain)
        review_comments = re.sub('[A-Za-z]', '', ''.join(raw_review_comments)).strip()
        review_comments = int(review_comments) if review_comments else 0

        review_dict = {
            'review_comment_count': review_comments,
            'review_text': review_text,
            'review_posted_date': review_posted_date,
            'review_header': review_header,
            'review_rating': review_rating,
            'review_author': author,
            'review_buyer': buyer,
        }
        reviews_list.append(review_dict)
    return reviews_list
def ParseReviews(asin):
    # This script has only been tested with Amazon.com
    amazon_url = 'http://www.amazon.' + AMAZON_VERSION + '/dp/product-reviews/' + asin
    # Send a recent user agent to prevent Amazon from blocking the request.
    # Find Chrome user agent strings here: https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    page = requests.get(amazon_url, headers=headers, verify=False)
    parser = html.fromstring(page.text)

    XPATH_PRODUCT_NAME = '//h1//a[@data-hook="product-link"]//text()'
    XPATH_PRODUCT_PRICE = '//div[contains(@class,"product-price-line")]//span[contains(@class,"a-color-price")]/text()'
    raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
    product_price = ''.join(raw_product_price).replace(',', '.').replace("\xa0", " ")
    raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
    product_name = ''.join(raw_product_name).strip()
    # Read the highest page number from the bottom pagination links, skipping
    # entries without digits (e.g. "Next"); a product with a single review
    # page has no pagination at all, hence default=1.
    pagenumbers = [int(re.sub(r"\D", "", x)) for x in parser.xpath('//li[@data-reftag="cm_cr_arp_d_paging_btm"]//a/text()') if re.search(r"\d", x)]
    pagecount = max(pagenumbers, default=1)
    ratings_dict = {
        "total": {
            "all": {i: 0 for i in range(1, 6)},
            "vine": {i: 0 for i in range(1, 6)},
            "verified": {i: 0 for i in range(1, 6)},
            "unverified": {i: 0 for i in range(1, 6)},
        },
        "percent": {
            "all": {i: "" for i in range(1, 6)},
            "vine": {i: "" for i in range(1, 6)},
            "verified": {i: "" for i in range(1, 6)},
            "unverified": {i: "" for i in range(1, 6)},
        },
        "average": {
            "all": "",
            "vine": "",
            "verified": "",
            "unverified": "",
        },
    }
    # Fetch all review pages in parallel, one worker thread per page (capped at 500)
    with concurrent.futures.ThreadPoolExecutor(max_workers=min(pagecount, 500)) as executor:
        futures = [executor.submit(ParseReviewPage, asin, i, pagecount) for i in range(1, pagecount + 1)]
        reviews_list = sum([x.result() for x in concurrent.futures.as_completed(futures)], [])
    # Sequential alternative, useful for debugging:
    # reviews_list = sum([ParseReviewPage(asin, i, pagecount) for i in range(1, pagecount + 1)], [])
    # Tally the star ratings, overall and per buyer type
    for review in reviews_list:
        rating = review["review_rating"]
        buyer = review["review_buyer"]
        ratings_dict["total"]["all"][rating] += 1
        ratings_dict["total"][buyer][rating] += 1
    # Derive the percentages and the average star rating per buyer type
    for buyer in ratings_dict["total"]:
        buyer_total = max(sum(ratings_dict["total"][buyer].values()), 1)
        for i in range(1, 6):
            ratings_dict["percent"][buyer][i] = str(round(ratings_dict["total"][buyer][i] * 100 / buyer_total)) + "%"
        ratings_dict["average"][buyer] = "%0.4f*" % (sum([ratings_dict["total"][buyer][i] * i for i in range(1, 6)]) / buyer_total)
    print(json.dumps(ratings_dict, indent=4))
    data = {
        'ratings': ratings_dict,
        'reviews': reviews_list,
        'url': amazon_url,
        'price': product_price,
        'name': product_name,
    }
    return data
def ReadAsin(asin):
    print("Downloading and processing page http://www.amazon." + AMAZON_VERSION + "/dp/" + asin)
    extracted_data = ParseReviews(asin)
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)

if __name__ == '__main__':
    ReadAsin(ASIN)
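
# Usage sketch: set AMAZON_VERSION and ASIN above, then run the script
# directly (file name assumed here):
#
#     python3 amazon_reviews.py
#
# The aggregated ratings histogram is printed to stdout and the full result
# is written to data.json, shaped roughly like this (values illustrative):
#
# {
#     "ratings": {"total": {...}, "percent": {...}, "average": {...}},
#     "reviews": [
#         {
#             "review_comment_count": 0,
#             "review_text": "...",
#             "review_posted_date": "19 Oct 2018",
#             "review_header": "...",
#             "review_rating": 5,
#             "review_author": "...",
#             "review_buyer": "verified"
#         }
#     ],
#     "url": "http://www.amazon.com/dp/product-reviews/B010S9M3L6",
#     "price": "...",
#     "name": "..."
# }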