-
-
Save Dakkaron/d67ea1fb93a96246f0be8d23bd1fd4c4 to your computer and use it in GitHub Desktop.
Python 3 script that extracts Amazon product reviews and aggregates their rating statistics.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Written as part of https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/ | |
from lxml import html | |
import json | |
import requests | |
import json,re | |
from dateutil import parser as dateparser | |
from time import sleep | |
import urllib3 | |
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | |
import concurrent.futures | |
# Amazon top-level domain to scrape ("com", "de", ...); selects the LOCALE entry below.
AMAZON_VERSION = 'com'

# Add your own ASINs here
ASIN = 'B010S9M3L6'

# Locale-specific badge strings Amazon renders on a review; used to classify
# each review's buyer status as "vine", "verified" or "unverified".
LOCALE = {
    'de': {
        'vine': 'Vine Kundenrezension eines kostenfreien Produkts',
        'verified': 'Verifizierter Kauf',
    },
    'com': {
        'vine': 'Vine Customer Review of Free Product',
        'verified': 'Verified Purchase',
    },
}
def ParseReviewPage(asin, pagenum, pagecount):
    """Fetch and parse one page of Amazon reviews for *asin*.

    Args:
        asin: Amazon product identifier.
        pagenum: 1-based page number to request.
        pagecount: total number of pages (used only for progress output).

    Returns:
        A list of dicts, one per review, with keys review_comment_count,
        review_text, review_posted_date, review_header, review_rating,
        review_author and review_buyer.

    Raises:
        ValueError: if no review elements could be located after 3 attempts.
    """
    print("Parsing page " + str(pagenum) + "/" + str(pagecount))
    amazon_url = 'http://www.amazon.' + AMAZON_VERSION + '/dp/product-reviews/' + asin + '/?pageNumber=' + str(pagenum)
    # Add some recent user agent to prevent amazon from blocking the request
    # Find some chrome user agent strings here
    # https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # Amazon serves (at least) two review-list markups; try both selectors.
    XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
    XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
    reviews = None
    for attempt in range(3):
        page = requests.get(amazon_url, headers=headers, verify=False)
        parser = html.fromstring(page.text)
        reviews = parser.xpath(XPATH_REVIEW_SECTION_1)
        if not reviews:
            reviews = parser.xpath(XPATH_REVIEW_SECTION_2)
        if reviews:
            break
        sleep(1)  # brief back-off before retrying; Amazon throttles bursts
    if not reviews:
        raise ValueError('unable to find reviews in page')
    # XPath expressions are loop-invariant; define them once, not per review.
    XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
    XPATH_REVIEW_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
    XPATH_REVIEW_TEXT = './/span[@data-hook="review-body"]//text()'
    XPATH_REVIEW_COMMENTS = './/span[contains(@class,"review-comment-total")]//text()'
    XPATH_AUTHOR = './/a[@data-hook="review-author"]//text()'
    XPATH_BUYER = './/div[contains(@class,"review-format-strip")]//span//text()'
    reviews_list = []
    # Parsing individual reviews
    for review in reviews:
        raw_review_author = review.xpath(XPATH_AUTHOR)
        raw_review_rating = review.xpath(XPATH_RATING)
        raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
        raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
        raw_review_text = review.xpath(XPATH_REVIEW_TEXT)
        raw_buyer = review.xpath(XPATH_BUYER)
        # Cleaning data: collapse runs of whitespace into single spaces.
        author = ' '.join(' '.join(raw_review_author).split())
        # Rating text looks like "4.0 out of 5 stars"; the first character
        # is the star count.
        review_rating = int(''.join(raw_review_rating)[0])
        review_header = ' '.join(' '.join(raw_review_header).split())
        # Classify the buyer by the locale-specific badge strings.
        isVerified = any(x == LOCALE[AMAZON_VERSION]['verified'] for x in raw_buyer)
        isVine = any(x == LOCALE[AMAZON_VERSION]['vine'] for x in raw_buyer)
        buyer = "vine" if isVine else ("verified" if isVerified else "unverified")
        try:
            review_posted_date = dateparser.parse(''.join(raw_review_posted_date)).strftime('%d %b %Y')
        except (ValueError, OverflowError):
            # dateutil could not parse the date text (unexpected locale or
            # missing element); previously a bare except hid every error here.
            review_posted_date = None
        review_text = ' '.join(' '.join(raw_review_text).split())
        raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)
        # Strip letters ("3 people found this helpful" -> "3"). A review with
        # no comments yields an empty string, which previously crashed int().
        comment_digits = re.sub(r'[A-Za-z]', '', ''.join(raw_review_comments)).strip()
        review_comments = int(comment_digits) if comment_digits else 0
        review_dict = {
            'review_comment_count': review_comments,
            'review_text': review_text,
            'review_posted_date': review_posted_date,
            'review_header': review_header,
            'review_rating': review_rating,
            'review_author': author,
            'review_buyer': buyer,
        }
        reviews_list.append(review_dict)
    return reviews_list
def ParseReviews(asin):
    """Scrape every review page for *asin* and aggregate rating statistics.

    Args:
        asin: Amazon product identifier.

    Returns:
        A dict with keys:
            ratings -- per-buyer-type totals, percentages and averages,
            reviews -- list of review dicts (see ParseReviewPage),
            url     -- the product-reviews URL that was scraped,
            price   -- product price string as shown on the page,
            name    -- product name.
    """
    # This script has only been tested with Amazon.com
    amazon_url = 'http://www.amazon.' + AMAZON_VERSION + '/dp/product-reviews/' + asin
    # Add some recent user agent to prevent amazon from blocking the request
    # Find some chrome user agent strings here
    # https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    page = requests.get(amazon_url, headers=headers, verify=False)
    parser = html.fromstring(page.text)
    XPATH_PRODUCT_NAME = '//h1//a[@data-hook="product-link"]//text()'
    XPATH_PRODUCT_PRICE = '//div[contains(@class,"product-price-line")]//span[contains(@class,"a-color-price")]/text()'
    raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
    # Normalize decimal comma and non-breaking spaces in the price string.
    product_price = ''.join(raw_product_price).replace(',', '.').replace("\xa0", " ")
    raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
    product_name = ''.join(raw_product_name).strip()
    # Highest page number in the bottom pagination bar. A product whose
    # reviews fit on a single page has no pagination links at all, which
    # previously made max() raise ValueError on an empty sequence.
    page_links = parser.xpath('//li[@data-reftag="cm_cr_arp_d_paging_btm"]//a/text()')
    page_numbers = [int(re.sub(r"\D", "", x)) for x in page_links]
    pagecount = max(page_numbers) if page_numbers else 1
    ratings_dict = {
        "total": {
            "all": {i: 0 for i in range(1, 6)},
            "vine": {i: 0 for i in range(1, 6)},
            "verified": {i: 0 for i in range(1, 6)},
            "unverified": {i: 0 for i in range(1, 6)},
        },
        "percent": {
            "all": {i: "" for i in range(1, 6)},
            "vine": {i: "" for i in range(1, 6)},
            "verified": {i: "" for i in range(1, 6)},
            "unverified": {i: "" for i in range(1, 6)},
        },
        "average": {
            "all": "",
            "vine": "",
            "verified": "",
            "unverified": "",
        },
    }
    # Fetch all review pages in parallel; completion order does not matter
    # because the results are only aggregated below.
    with concurrent.futures.ThreadPoolExecutor(max_workers=min(pagecount, 500)) as executor:
        futures = [executor.submit(ParseReviewPage, asin, i, pagecount) for i in range(1, pagecount + 1)]
        reviews_list = sum([x.result() for x in concurrent.futures.as_completed(futures)], [])
    # Tally every review into the per-buyer-type star histograms.
    for review in reviews_list:
        rating = review["review_rating"]
        buyer = review["review_buyer"]
        ratings_dict["total"]["all"][rating] += 1
        ratings_dict["total"][buyer][rating] += 1
    for buyer in ratings_dict["total"]:
        # Guard against division by zero for buyer types with no reviews.
        buyer_total = max(sum(ratings_dict["total"][buyer].values()), 1)
        for i in range(1, 6):
            ratings_dict["percent"][buyer][i] = str(round(ratings_dict["total"][buyer][i] * 100 / buyer_total)) + "%"
        ratings_dict["average"][buyer] = "%0.4f*" % (sum(ratings_dict["total"][buyer][i] * i for i in range(1, 6)) / buyer_total)
    print(json.dumps(ratings_dict, indent=4))
    data = {
        'ratings': ratings_dict,
        'reviews': reviews_list,
        'url': amazon_url,
        'price': product_price,
        'name': product_name,
    }
    return data
def ReadAsin(asin):
    """Download all reviews for *asin* and write the result to data.json."""
    print("Downloading and processing page http://www.amazon." + AMAZON_VERSION + "/dp/" + asin)
    extracted_data = ParseReviews(asin)
    # Context manager guarantees the file is flushed and closed even on error;
    # the original left the handle open.
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == '__main__':
    ReadAsin(ASIN)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment