Python Code to Scrape Customer Reviews from Amazon.com. Read more on https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Written as part of https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/
from lxml import html
from dateutil import parser as dateparser
from time import sleep
import json
import re
import requests


def ParseReviews(asin):
    # Retry a few times, since Amazon occasionally returns a blocked or empty page
    for i in range(5):
        try:
            # This script has only been tested with Amazon.com
            amazon_url = 'http://www.amazon.com/dp/' + asin
            # Add a recent user agent to prevent Amazon from blocking the request
            # Find Chrome user agent strings here: https://udger.com/resources/ua-list/browser-detail?browser=Chrome
            headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
            page = requests.get(amazon_url, headers=headers)
            page_response = page.text

            parser = html.fromstring(page_response)
            XPATH_AGGREGATE = '//span[@id="acrCustomerReviewText"]'
            XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
            XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
            XPATH_AGGREGATE_RATING = '//table[@id="histogramTable"]//tr'
            XPATH_PRODUCT_NAME = '//h1//span[@id="productTitle"]//text()'
            XPATH_PRODUCT_PRICE = '//span[@id="priceblock_ourprice"]/text()'

            raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
            product_price = ''.join(raw_product_price).replace(',', '')

            raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
            product_name = ''.join(raw_product_name).strip()

            total_ratings = parser.xpath(XPATH_AGGREGATE_RATING)
            reviews = parser.xpath(XPATH_REVIEW_SECTION_1)
            if not reviews:
                reviews = parser.xpath(XPATH_REVIEW_SECTION_2)

            ratings_dict = {}
            reviews_list = []

            if not reviews:
                raise ValueError('unable to find reviews in page')

            # Grabbing the rating histogram from the product page
            for ratings in total_ratings:
                extracted_rating = ratings.xpath('./td//a//text()')
                if extracted_rating:
                    rating_key = extracted_rating[0]
                    raw_rating_value = extracted_rating[1]
                    rating_value = raw_rating_value
                    if rating_key:
                        ratings_dict.update({rating_key: rating_value})

            # Parsing individual reviews
            for review in reviews:
                XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
                XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
                XPATH_REVIEW_POSTED_DATE = './/a[contains(@href,"/profile/")]/parent::span/following-sibling::span/text()'
                XPATH_REVIEW_TEXT_1 = './/div[@data-hook="review-collapsed"]//text()'
                XPATH_REVIEW_TEXT_2 = './/div//span[@data-action="columnbalancing-showfullreview"]/@data-columnbalancing-showfullreview'
                XPATH_REVIEW_COMMENTS = './/span[@data-hook="review-comment"]//text()'
                XPATH_AUTHOR = './/a[contains(@href,"/profile/")]/parent::span//text()'
                XPATH_REVIEW_TEXT_3 = './/div[contains(@id,"dpReviews")]/div/text()'

                raw_review_author = review.xpath(XPATH_AUTHOR)
                raw_review_rating = review.xpath(XPATH_RATING)
                raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
                raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
                raw_review_text1 = review.xpath(XPATH_REVIEW_TEXT_1)
                raw_review_text2 = review.xpath(XPATH_REVIEW_TEXT_2)
                raw_review_text3 = review.xpath(XPATH_REVIEW_TEXT_3)

                # Cleaning data; remove the leading "By" label without eating
                # author names that start with B or y
                author = re.sub(r'^By\s+', '', ' '.join(' '.join(raw_review_author).split()))
                review_rating = ''.join(raw_review_rating).replace('out of 5 stars', '')
                review_header = ' '.join(' '.join(raw_review_header).split())
                review_posted_date = dateparser.parse(''.join(raw_review_posted_date)).strftime('%d %b %Y')
                review_text = ' '.join(' '.join(raw_review_text1).split())

                # Grabbing hidden (collapsed) review text if present
                if raw_review_text2:
                    json_loaded_review_data = json.loads(raw_review_text2[0])
                    json_loaded_review_data_text = json_loaded_review_data['rest']
                    cleaned_json_loaded_review_data_text = re.sub('<.*?>', '', json_loaded_review_data_text)
                    full_review_text = review_text + cleaned_json_loaded_review_data_text
                else:
                    full_review_text = review_text
                if not raw_review_text1:
                    full_review_text = ' '.join(' '.join(raw_review_text3).split())

                raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)
                review_comments = ''.join(raw_review_comments)
                review_comments = re.sub('[A-Za-z]', '', review_comments).strip()

                review_dict = {
                    'review_comment_count': review_comments,
                    'review_text': full_review_text,
                    'review_posted_date': review_posted_date,
                    'review_header': review_header,
                    'review_rating': review_rating,
                    'review_author': author
                }
                reviews_list.append(review_dict)

            data = {
                'ratings': ratings_dict,
                'reviews': reviews_list,
                'url': amazon_url,
                'price': product_price,
                'name': product_name
            }
            return data
        except ValueError:
            print("Retrying to get the correct response")

    return {"error": "failed to process the page", "asin": asin}


def ReadAsin():
    # Add your own ASINs here
    AsinList = ['B01ETPUQ6E', 'B017HW9DEW']
    extracted_data = []
    for asin in AsinList:
        print("Downloading and processing page http://www.amazon.com/dp/" + asin)
        extracted_data.append(ParseReviews(asin))
        sleep(5)
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == '__main__':
    ReadAsin()
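
To try it, install the dependencies (requests, lxml, python-dateutil), add your ASINs to AsinList, and run the script; it writes its results to data.json. A minimal sketch of reading that file back, assuming the structure produced by the review_dict and data dictionaries above:

import json

with open('data.json') as f:
    products = json.load(f)

for product in products:
    # Products that failed to parse only carry "error" and "asin" keys
    print(product.get('name'), '-', len(product.get('reviews', [])), 'reviews scraped')
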

rohanpai commented Jan 2, 2017

This script does not seem to work anymore. Any ideas why?

scrapehero (Owner) commented Feb 21, 2017

@rohanpai Just updated the code. At times you need a good proxy for this to work when you have more than 10 products, OR increase the delay.
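
A minimal sketch of both suggestions, not part of the gist itself: requests accepts a proxies mapping, and the per-product delay can simply be raised. PROXY_URL below is a placeholder, not a working endpoint.

import random
from time import sleep

import requests

# Placeholder proxy endpoint -- substitute your own HTTP(S) proxy
PROXY_URL = 'http://user:password@proxy.example.com:8080'
PROXIES = {'http': PROXY_URL, 'https': PROXY_URL}


def fetch(url, headers):
    # Route the request through the proxy, then back off 10-20 seconds
    # before the next product to reduce the chance of being blocked
    response = requests.get(url, headers=headers, proxies=PROXIES, timeout=30)
    sleep(random.uniform(10, 20))
    return response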

blubbi321 commented Apr 7, 2017

Just returns the first "page" of reviews for me?

lehekavi commented Jul 29, 2017

This scraper works but also only returns the first page of reviews. How do I get it to loop over all pages and return all reviews?
Thank you
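
One way to do that, sketched under the assumption that the paginated review listing lives at /product-reviews/<ASIN>/?pageNumber=N (rather than the /dp/ product page the gist fetches): keep requesting successive pages until a page yields no review blocks. Only the review header is extracted here as an example; the other per-review XPaths from ParseReviews() apply the same way.

from time import sleep

from lxml import html
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}


def parse_all_reviews(asin, max_pages=50):
    """Walk the paginated review listing until a page has no reviews left."""
    all_reviews = []
    for page_number in range(1, max_pages + 1):
        # Assumed URL pattern for the dedicated review listing
        url = 'http://www.amazon.com/product-reviews/%s/?pageNumber=%d' % (asin, page_number)
        response = requests.get(url, headers=HEADERS)
        parser = html.fromstring(response.text)
        review_elements = parser.xpath('//div[@data-hook="review"]')
        if not review_elements:
            break  # no more review pages
        for element in review_elements:
            # Same header XPath as ParseReviews(); swap in the full extraction as needed
            header = ' '.join(' '.join(element.xpath('.//a[@data-hook="review-title"]//text()')).split())
            all_reviews.append(header)
        sleep(5)
    return all_reviews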

sagargaikwad966 commented Sep 1, 2017

Thanks, this code works but only for a limited number of reviews. How can I get all the reviews?

vscrape commented Sep 22, 2017

Vscrape.com provides Amazon scraping tools without getting IP-blocked or banned. With these tools anyone can easily scrape millions of records.
Below are a few of the tools we provide:

1. Amazon scraping and repricing tools
2. Amazon competitor product monitoring tools
3. FBA scraping tools
4. Buy Box scraping tools
5. Amazon title modification alert tools
6. Amazon-to-eBay price comparisons
7. Amazon-to-eBay automatic scraping and listing tools that maintain price and stock
8. AliExpress-to-eBay automatic listing tools that maintain price and stock
9. Walmart, B&H Photo Video, Best Buy and many other websites to eBay listing tools that maintain price and stock
10. eBay scraping and tracking tools
11. ASIN tracking tools
12. eBay listing tools
13. Scraping millions of records from any website, etc.

Based on your needs, I can develop or modify these tools.
Contact us for a demo.

#1 Web Scraping Software - Vscrape.com | Free Developer Support

gir-git commented Jan 22, 2018

Hey, could you explain what @data-hook, for example on line 26, is meant for?
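
For context, not an answer from the gist author: in an XPath expression, @data-hook refers to the HTML attribute named data-hook, which Amazon's markup uses to label review components. A small self-contained illustration with lxml:

from lxml import html

# <div data-hook="review"> is how Amazon tags each review block;
# //div[@data-hook="review"] selects every div carrying that attribute value
fragment = html.fromstring('<div data-hook="review"><span>Great product</span></div>')
print(fragment.xpath('//div[@data-hook="review"]//text()'))  # ['Great product']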

Madhvi2 commented Mar 21, 2018

Please suggest the approach to follow for extraction of all the reviews.
