#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Written as part of https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/
from lxml import html
from json import dump, loads
from requests import get
from re import sub
from dateutil import parser as dateparser
from time import sleep
import urllib3

# We fetch with verify=False below, so silence the resulting InsecureRequestWarning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def ParseReviews(asin):
    # This script has only been tested with Amazon.com
    amazon_url = 'http://www.amazon.com/dp/' + asin
    # Use a recent user agent to keep Amazon from blocking the request.
    # Chrome user agent strings: https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # Retry up to five times on non-200 responses
    for i in range(5):
        response = get(amazon_url, headers=headers, verify=False, timeout=30)
        if response.status_code == 404:
            return {"url": amazon_url, "error": "page not found"}
        if response.status_code != 200:
            continue

        # Remove null bytes from the response.
        cleaned_response = response.text.replace('\x00', '')
        parser = html.fromstring(cleaned_response)

        XPATH_AGGREGATE = '//span[@id="acrCustomerReviewText"]'
        XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
        XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
        XPATH_AGGREGATE_RATING = '//table[@id="histogramTable"]//tr'
        XPATH_PRODUCT_NAME = '//h1//span[@id="productTitle"]//text()'
        XPATH_PRODUCT_PRICE = '//span[@id="priceblock_ourprice"]/text()'

        raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
        raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
        total_ratings = parser.xpath(XPATH_AGGREGATE_RATING)
        reviews = parser.xpath(XPATH_REVIEW_SECTION_1)

        product_price = ''.join(raw_product_price).replace(',', '')
        product_name = ''.join(raw_product_name).strip()
        if not reviews:
            reviews = parser.xpath(XPATH_REVIEW_SECTION_2)
        ratings_dict = {}
        reviews_list = []

        # Grabbing the rating histogram from the product page
        for ratings in total_ratings:
            extracted_rating = ratings.xpath('./td//a//text()')
            if extracted_rating:
                rating_key = extracted_rating[0]
                raw_rating_value = extracted_rating[1]
                rating_value = raw_rating_value
                if rating_key:
                    ratings_dict.update({rating_key: rating_value})

        # Parsing individual reviews
        for review in reviews:
            XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
            XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
            XPATH_REVIEW_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
            XPATH_REVIEW_TEXT_1 = './/div[@data-hook="review-collapsed"]//text()'
            XPATH_REVIEW_TEXT_2 = './/div//span[@data-action="columnbalancing-showfullreview"]/@data-columnbalancing-showfullreview'
            XPATH_REVIEW_COMMENTS = './/span[@data-hook="review-comment"]//text()'
            XPATH_AUTHOR = './/span[contains(@class,"profile-name")]//text()'
            XPATH_REVIEW_TEXT_3 = './/div[contains(@id,"dpReviews")]/div/text()'

            raw_review_author = review.xpath(XPATH_AUTHOR)
            raw_review_rating = review.xpath(XPATH_RATING)
            raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
            raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
            raw_review_text1 = review.xpath(XPATH_REVIEW_TEXT_1)
            raw_review_text2 = review.xpath(XPATH_REVIEW_TEXT_2)
            raw_review_text3 = review.xpath(XPATH_REVIEW_TEXT_3)

            # Cleaning data
            author = ' '.join(' '.join(raw_review_author).split())
            review_rating = ''.join(raw_review_rating).replace('out of 5 stars', '')
            review_header = ' '.join(' '.join(raw_review_header).split())
            try:
                review_posted_date = dateparser.parse(''.join(raw_review_posted_date)).strftime('%d %b %Y')
            except (ValueError, OverflowError):
                review_posted_date = None
            review_text = ' '.join(' '.join(raw_review_text1).split())

            # Grabbing hidden (collapsed) review text if present
            if raw_review_text2:
                json_loaded_review_data = loads(raw_review_text2[0])
                json_loaded_review_data_text = json_loaded_review_data['rest']
                cleaned_json_loaded_review_data_text = sub('<.*?>', '', json_loaded_review_data_text)
                full_review_text = review_text + cleaned_json_loaded_review_data_text
            else:
                full_review_text = review_text
            if not raw_review_text1:
                full_review_text = ' '.join(' '.join(raw_review_text3).split())

            raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)
            review_comments = ''.join(raw_review_comments)
            # Strip the letters and keep only the comment count digits
            review_comments = sub('[A-Za-z]', '', review_comments).strip()

            review_dict = {
                'review_comment_count': review_comments,
                'review_text': full_review_text,
                'review_posted_date': review_posted_date,
                'review_header': review_header,
                'review_rating': review_rating,
                'review_author': author
            }
            reviews_list.append(review_dict)

        data = {
            'ratings': ratings_dict,
            'reviews': reviews_list,
            'url': amazon_url,
            'name': product_name,
            'price': product_price
        }
        return data

    return {"error": "failed to process the page", "url": amazon_url}


def ReadAsin():
    # Add your own ASINs here
    AsinList = ['B01ETPUQ6E', 'B017HW9DEW', 'B00U8KSIOM']
    extracted_data = []
    for asin in AsinList:
        print("Downloading and processing page http://www.amazon.com/dp/" + asin)
        extracted_data.append(ParseReviews(asin))
        sleep(5)
    with open('data.json', 'w') as f:
        dump(extracted_data, f, indent=4)


if __name__ == '__main__':
    ReadAsin()
https://github.com/DavidRoldan523/amazon_reviews_allpages
This code is a script to scrape all reviews on all Amazon pages.
Sorry, but the link you mentioned cannot be found.
Hi there, the link does work: https://github.com/DavidRoldan523/amazon_reviews_allpages. My repositories on GitHub: https://github.com/DavidRoldan523?tab=repositories
Thanks a lot for the response.
Is it possible to get product ASINs so I can look for the products I prefer? And if I want to collect reviews for a specific brand within a specific category (Huawei as the brand, electronics and accessories as the category), does your code work for that?
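One way to collect ASINs for a brand/category search is to pull them off an Amazon search results page. This is a rough sketch, not part of the repo above: the search URL pattern (k= keyword, i= category alias) and the data-asin attribute on result tiles are assumptions about Amazon's current markup and may change or be blocked.

from lxml import html
from requests import get


def collect_asins(keyword, category='electronics'):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # Assumed search URL pattern, e.g. https://www.amazon.com/s?k=huawei&i=electronics
    search_url = 'https://www.amazon.com/s?k=%s&i=%s' % (keyword, category)
    response = get(search_url, headers=headers, timeout=30)
    parser = html.fromstring(response.text)
    # Result tiles usually carry the ASIN in a data-asin attribute (assumption)
    asins = parser.xpath('//div[@data-asin and string-length(@data-asin) > 0]/@data-asin')
    return list(dict.fromkeys(asins))  # de-duplicate while preserving order

# The collected ASINs could then be fed into ParseReviews from the gist above:
# for asin in collect_asins('huawei'):
#     ParseReviews(asin)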
Hi David,
Thanks for sharing the code! There seems to be a small typo on line 71: should it be "if int(data['number_reviews']) % 2 == 0" rather than "if number_page_reviews % 2 == 0"?
Another quick question: there seems to be a maximum limit of 5,000 reviews that can be extracted. For example, the Instant Pot pressure cooker (ASIN: B00FLYWNYQ) has over 34k reviews, but I can only extract 5,000 of them.
Thanks!
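For what it's worth, the gist above only parses reviews shown on the product page itself. A minimal pagination sketch (not the gist's code) that walks the dedicated review pages is below; the /product-reviews/<ASIN>?pageNumber=N URL pattern is an assumption about Amazon's site layout, and Amazon may still cap how deep pagination goes regardless of how many reviews exist.

from lxml import html
from requests import get
from time import sleep


def iter_review_pages(asin, max_pages=10):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    for page in range(1, max_pages + 1):
        # Assumed review-page URL pattern
        url = 'https://www.amazon.com/product-reviews/%s/?pageNumber=%d' % (asin, page)
        response = get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            break
        parser = html.fromstring(response.text)
        reviews = parser.xpath('//div[@data-hook="review"]')
        if not reviews:
            break  # past the last page, or the request was blocked
        yield reviews
        sleep(3)  # be polite between requests

Each yielded batch can be parsed with the same per-review XPaths used in the gist above.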
Hi David,
I was able to successfully apply your code to a few ASINs. However, isn't your code supposed to produce a JSON or CSV file? I cannot find one anywhere after running the code. The only thing I changed was the list of ASINs, and the code ran successfully. I figured I would find the file in my Jupyter notebook's 'Files' tab. I also looked in my downloaded files in Windows. No luck... Any advice on what I ought to do from here?
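The script writes data.json to whatever directory Python was started from, which inside a notebook may not be where you expect. A quick way to locate it:

import os

# Prints the absolute path where open('data.json', 'w') will write the file
print(os.path.abspath('data.json'))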
I am new to writing programs in Python. I want to scrape Amazon data by UPC rather than ASIN: title, manufacturer, brand name, image, item weight, and price. How can I do that? Any suggestions, please?
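Amazon product pages are keyed by ASIN, so one possible approach (a hedged sketch, not from the gist) is to resolve the UPC to an ASIN through Amazon search first, then scrape the product page as usual. The search URL and the data-asin attribute are assumptions about Amazon's markup.

from lxml import html
from requests import get


def upc_to_asin(upc):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # Searching by the raw UPC often surfaces the matching product (assumption)
    url = 'https://www.amazon.com/s?k=' + upc
    response = get(url, headers=headers, timeout=30)
    parser = html.fromstring(response.text)
    asins = parser.xpath('//div[@data-asin and string-length(@data-asin) > 0]/@data-asin')
    return asins[0] if asins else None

# e.g. asin = upc_to_asin('853084004477'); then scrape http://www.amazon.com/dp/<asin>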
from lxml import html
import csv, os, json
import requests
from time import sleep


def AmzonParser(url):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # Retry a few times in case of a bad response or a parse failure
    for _ in range(5):
        page = requests.get(url, headers=headers)
        sleep(3)
        try:
            doc = html.fromstring(page.content)
            XPATH_NAME = '//h1[@id="title"]//text()'
            XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
            XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
            XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
            XPATH_AVAILABILITY = '//div[@id="availability"]//text()'

            raw_name = doc.xpath(XPATH_NAME)
            raw_sale_price = doc.xpath(XPATH_SALE_PRICE)
            raw_original_price = doc.xpath(XPATH_ORIGINAL_PRICE)
            raw_category = doc.xpath(XPATH_CATEGORY)
            raw_availability = doc.xpath(XPATH_AVAILABILITY)

            # Clean up whitespace in each extracted field
            name = ' '.join(''.join(raw_name).split()) if raw_name else None
            sale_price = ' '.join(''.join(raw_sale_price).split()) if raw_sale_price else None
            original_price = ''.join(raw_original_price).strip() if raw_original_price else None
            category = ' > '.join([i.strip() for i in raw_category]) if raw_category else None
            availability = ''.join(raw_availability).strip() if raw_availability else None

            return {
                'NAME': name,
                'SALE_PRICE': sale_price,
                'ORIGINAL_PRICE': original_price,
                'CATEGORY': category,
                'AVAILABILITY': availability,
                'URL': url,
            }
        except Exception:
            continue
    return {'URL': url, 'ERROR': 'failed to process the page'}


def ReadAsin():
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__), "Asinfeed.csv")))
    AsinList = ['B0046UR4F4',
                'B00JGTVU5A',
                'B00GJYCIVK',
                'B00EPGK7CQ',
                'B00EPGKA4G',
                'B00YW5DLB4',
                'B00KGD0628',
                'B00O9A48N2',
                'B00O9A4MEW',
                'B00UZKG8QU']
    extracted_data = []
    for i in AsinList:
        url = "http://www.amazon.com/dp/" + i
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url))
        sleep(5)
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == "__main__":
    ReadAsin()