Skip to content

Instantly share code, notes, and snippets.

@scrapehero
Last active November 16, 2022 03:43
Show Gist options
  • Save scrapehero/900419a768c5fac9ebdef4cb246b25cb to your computer and use it in GitHub Desktop.
Save scrapehero/900419a768c5fac9ebdef4cb246b25cb to your computer and use it in GitHub Desktop.
Python 3 code to extract amazon reviews
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Written as part of https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/
from lxml import html
from json import dump,loads
from requests import get
import json
from re import sub
from dateutil import parser as dateparser
from time import sleep
def _squash_whitespace(fragments):
    """Join text fragments with spaces and collapse each whitespace run to one space."""
    return ' '.join(' '.join(fragments).split())


def ParseReviews(asin):
    """Download the Amazon.com product page for *asin* and extract its reviews.

    The page is requested up to five times; any response other than 200 or
    404 triggers another attempt.

    Args:
        asin: Amazon product identifier, appended to
            ``http://www.amazon.com/dp/`` to build the page URL.

    Returns:
        On success, a dict with the keys ``ratings`` (histogram label ->
        value), ``reviews`` (list of per-review dicts), ``url``, ``name``
        and ``price``.
        On a 404 response, ``{"url": ..., "error": "page not found"}``.
        If no attempt returns 200,
        ``{"error": "failed to process the page", "url": ...}``.
    """
    # This script has only been tested with Amazon.com
    amazon_url = 'http://www.amazon.com/dp/' + asin
    # Add some recent user agent to prevent amazon from blocking the request
    # Find some chrome user agent strings here https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

    # Page-level XPath expressions.
    XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
    XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
    XPATH_AGGREGATE_RATING = '//table[@id="histogramTable"]//tr'
    XPATH_PRODUCT_NAME = '//h1//span[@id="productTitle"]//text()'
    XPATH_PRODUCT_PRICE = '//span[@id="priceblock_ourprice"]/text()'

    # Per-review XPath expressions (evaluated relative to one review node).
    XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
    XPATH_REVIEW_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
    XPATH_REVIEW_TEXT_1 = './/div[@data-hook="review-collapsed"]//text()'
    XPATH_REVIEW_TEXT_2 = './/div//span[@data-action="columnbalancing-showfullreview"]/@data-columnbalancing-showfullreview'
    XPATH_REVIEW_COMMENTS = './/span[@data-hook="review-comment"]//text()'
    XPATH_AUTHOR = './/span[contains(@class,"profile-name")]//text()'
    XPATH_REVIEW_TEXT_3 = './/div[contains(@id,"dpReviews")]/div/text()'

    for _ in range(5):
        # NOTE(review): verify=False turns off TLS certificate verification;
        # kept for backward compatibility, consider removing it.
        response = get(amazon_url, headers=headers, verify=False, timeout=30)
        if response.status_code == 404:
            return {"url": amazon_url, "error": "page not found"}
        if response.status_code != 200:
            continue

        # Removing the null bytes from the response.
        cleaned_response = response.text.replace('\x00', '')
        parser = html.fromstring(cleaned_response)

        raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
        raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
        total_ratings = parser.xpath(XPATH_AGGREGATE_RATING)
        # Try the summary layout first, then fall back to individual review nodes.
        reviews = parser.xpath(XPATH_REVIEW_SECTION_1)
        if not reviews:
            reviews = parser.xpath(XPATH_REVIEW_SECTION_2)

        product_price = ''.join(raw_product_price).replace(',', '')
        product_name = ''.join(raw_product_name).strip()

        ratings_dict = {}
        reviews_list = []

        # Grabbing the rating section in the product page
        for rating_row in total_ratings:
            extracted_rating = rating_row.xpath('./td//a//text()')
            # A usable row has a label and a value; skip partial rows
            # instead of failing with IndexError on the second element.
            if len(extracted_rating) < 2:
                continue
            rating_key = extracted_rating[0]
            rating_value = extracted_rating[1]
            if rating_key:
                ratings_dict[rating_key] = rating_value

        # Parsing individual reviews
        for review in reviews:
            raw_review_author = review.xpath(XPATH_AUTHOR)
            raw_review_rating = review.xpath(XPATH_RATING)
            raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
            raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
            raw_review_text1 = review.xpath(XPATH_REVIEW_TEXT_1)
            raw_review_text2 = review.xpath(XPATH_REVIEW_TEXT_2)
            raw_review_text3 = review.xpath(XPATH_REVIEW_TEXT_3)

            # Cleaning data
            author = _squash_whitespace(raw_review_author)
            review_rating = ''.join(raw_review_rating).replace('out of 5 stars', '')
            review_header = _squash_whitespace(raw_review_header)

            try:
                review_posted_date = dateparser.parse(''.join(raw_review_posted_date)).strftime('%d %b %Y')
            except (ValueError, OverflowError):
                # Missing or unparseable date text: record the date as unknown.
                review_posted_date = None
            review_text = _squash_whitespace(raw_review_text1)

            # Grabbing hidden comments if present
            if raw_review_text2:
                json_loaded_review_data = loads(raw_review_text2[0])
                json_loaded_review_data_text = json_loaded_review_data['rest']
                # Strip HTML tags. Only `sub` is imported from `re`, so the
                # module-qualified `re.sub` would raise NameError here.
                cleaned_json_loaded_review_data_text = sub('<.*?>', '', json_loaded_review_data_text)
                full_review_text = review_text + cleaned_json_loaded_review_data_text
            else:
                full_review_text = review_text
            if not raw_review_text1:
                # No collapsed text at all: use the alternative review-text location.
                full_review_text = _squash_whitespace(raw_review_text3)

            raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)
            review_comments = ''.join(raw_review_comments)
            # Drop ASCII letters so only the non-letter part (e.g. the count) remains.
            review_comments = sub('[A-Za-z]', '', review_comments).strip()

            reviews_list.append({
                'review_comment_count': review_comments,
                'review_text': full_review_text,
                'review_posted_date': review_posted_date,
                'review_header': review_header,
                'review_rating': review_rating,
                'review_author': author,
            })

        return {
            'ratings': ratings_dict,
            'reviews': reviews_list,
            'url': amazon_url,
            'name': product_name,
            'price': product_price,
        }

    return {"error": "failed to process the page", "url": amazon_url}
def ReadAsin():
    """Scrape every ASIN in the built-in list and write the results to data.json.

    Each ASIN is passed to ParseReviews and the returned dict is collected;
    the function pauses five seconds after each product. The collected list
    is then dumped as indented JSON to ``data.json`` in the current working
    directory.
    """
    # Add your own ASINs here
    AsinList = ['B01ETPUQ6E', 'B017HW9DEW', 'B00U8KSIOM']
    extracted_data = []
    for asin in AsinList:
        print("Downloading and processing page http://www.amazon.com/dp/" + asin)
        extracted_data.append(ParseReviews(asin))
        sleep(5)
    # The context manager closes the file even if serialisation fails.
    with open('data.json', 'w') as f:
        dump(extracted_data, f, indent=4)
# Start the scrape only when this file is run as a script, not when imported.
if __name__ == '__main__':
    ReadAsin()
@Anki89
Copy link

Anki89 commented Mar 14, 2018

I am new to writing programs in Python. I want to scrape Amazon data by UPC, not ASIN. How can I do it? I need to scrape Title, Manufacturer, Brand name, Image, Item Weight, and Price. I need help. Are there any suggestions, please?

from lxml import html
import csv,os,json
import requests
from exceptions import ValueError
from time import sleep

def AmzonParser(url, max_retries=5):
    """Fetch an Amazon product page and extract its basic listing details.

    Every attempt waits three seconds, requests the page again and parses
    it. A non-200 response or any error while fetching/parsing is printed
    and the next attempt starts, up to ``max_retries`` attempts in total.

    Args:
        url: Full product page URL.
        max_retries: Maximum number of fetch attempts before giving up.

    Returns:
        A dict with the keys ``NAME``, ``SALE_PRICE``, ``CATEGORY``,
        ``ORIGINAL_PRICE``, ``AVAILABILITY`` and ``URL`` (a field is None
        when nothing matched on the page), or None when every attempt failed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}

    XPATH_NAME = '//h1[@id="title"]//text()'
    XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
    XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
    # The attribute name must be lower-case: XPath is case-sensitive and the
    # HTML parser lower-cases attribute names, so "@Class" never matches.
    XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
    XPATH_AVAILABILITY = '//div[@id="availability"]//text()'

    for _ in range(max_retries):
        sleep(3)
        try:
            # Re-request on every attempt; re-parsing one failed response
            # can never succeed and previously looped forever.
            page = requests.get(url, headers=headers)
            if page.status_code != 200:
                raise ValueError('captha')

            doc = html.fromstring(page.content)
            raw_name = doc.xpath(XPATH_NAME)
            raw_sale_price = doc.xpath(XPATH_SALE_PRICE)
            raw_category = doc.xpath(XPATH_CATEGORY)
            raw_original_price = doc.xpath(XPATH_ORIGINAL_PRICE)
            raw_availability = doc.xpath(XPATH_AVAILABILITY)

            name = ' '.join(''.join(raw_name).split()) if raw_name else None
            sale_price = ' '.join(''.join(raw_sale_price).split()).strip() if raw_sale_price else None
            category = ' > '.join([part.strip() for part in raw_category]) if raw_category else None
            original_price = ''.join(raw_original_price).strip() if raw_original_price else None
            availability = ''.join(raw_availability).strip() if raw_availability else None

            # No list price found: report the sale price as the original price.
            if not original_price:
                original_price = sale_price

            return {
                'NAME': name,
                'SALE_PRICE': sale_price,
                'CATEGORY': category,
                'ORIGINAL_PRICE': original_price,
                'AVAILABILITY': availability,
                'URL': url,
            }
        except Exception as e:
            # Best-effort retry boundary: report the failure and try again.
            print(e)

    return None

def ReadAsin():
    """Scrape each ASIN in the built-in list and write the results to data.json.

    Builds the product URL for every ASIN, collects whatever AmzonParser
    returns for it, pauses five seconds after each product, and finally
    dumps the collected list as indented JSON to ``data.json`` in the
    current working directory.
    """
    # To read the ASINs from a CSV file next to this script instead, use:
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__),"Asinfeed.csv")))
    AsinList = [
        'B0046UR4F4',
        'B00JGTVU5A',
        'B00GJYCIVK',
        'B00EPGK7CQ',
        'B00EPGKA4G',
        'B00YW5DLB4',
        'B00KGD0628',
        'B00O9A48N2',
        'B00O9A4MEW',
        'B00UZKG8QU',
    ]
    extracted_data = []
    for asin in AsinList:
        url = "http://www.amazon.com/dp/" + asin
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url))
        sleep(5)
    # The context manager flushes and closes the file, which was never closed before.
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)

# Run the scrape only when executed as a script. The dunder names are
# required: a bare `name` is undefined and "main" is never the module name.
if __name__ == "__main__":
    ReadAsin()

@DavidRoldan523
Copy link

https://github.com/DavidRoldan523/amazon_reviews_allpages

This code is a Script to scrape all reviews on all Amazon pages

@cjgalvisc96
@DavidRoldan523

@petersamoaa
Copy link

https://github.com/DavidRoldan523/amazon_reviews_allpages

This code is a Script to scrape all reviews on all Amazon pages

@cjgalvisc96
@DavidRoldan523

Sorry but the link you mentioned is not found

@DavidRoldan523
Copy link

DavidRoldan523 commented May 21, 2019 via email

@petersamoaa
Copy link

Hi there, The link does work https://github.com/DavidRoldan523/amazon_reviews_allpages *My repository on github: * https://github.com/DavidRoldan523?tab=repositories El mar., 21 de may. de 2019 a la(s) 05:44, Hazem Samoaa ( notifications@github.com) escribió:

https://github.com/DavidRoldan523/amazon_reviews_allpages http://url This code is a Script to scrape all reviews on all Amazon pages @cjgalvisc96 https://github.com/cjgalvisc96 @DavidRoldan523 https://github.com/DavidRoldan523 Sorry but the link you mentioned is not found — You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://gist.github.com/900419a768c5fac9ebdef4cb246b25cb?email_source=notifications&email_token=AH7T6GHZ4EG6HTF4QJGGR73PWPHAFA5CNFSM4HIOGSUKYY3PNVWWK3TUL52HS4DFVNDWS43UINXW23LFNZ2KUY3PNVWWK3TUL5UWJTQAFSMQG#gistcomment-2922755, or mute the thread https://github.com/notifications/unsubscribe-auth/AH7T6GHK2XN4673RGNFFWKTPWPHAFANCNFSM4HIOGSUA .

Thanks a lot for your response.
Is it possible to get product ASINs so that I can look for the products I prefer? Also, if I want to collect reviews for a specific brand within a specific category (for example, Huawei as the brand and electronics and accessories as the category), does your code work?

@DavidRoldan523
Copy link

DavidRoldan523 commented May 21, 2019 via email

@DavidRoldan523
Copy link

DavidRoldan523 commented Jun 17, 2019 via email

@amritadey
Copy link

amritadey commented Jun 19, 2019 via email

@wanyi233
Copy link

Hi David,

Thanks for sharing the code! There seems to be a small typo on line 71. Should it be "if int(data['number_reviews']) % 2 == 0" rather than "if number_page_reviews % 2 == 0"?

Another quick question: there seems to be a maximum limit of 5,000 reviews that can be extracted. For example, this Instant Pot pressure cooker (ASIN: B00FLYWNYQ) has over 34k reviews, but I can only extract 5,000 of them.

Thanks!

@dnoftzger
Copy link

Hi David,

I was able to successfully apply your code to a few ASINs. However, isn't your code supposed to produce a json or csv file? I cannot find one anywhere after running the code. The only thing I changed were the ASINs listed and the code ran successfully. I figured I would find it in my jupyter notebooks 'Files' tab. Also looked in my downloaded files in Windows. No luck... Any advice on what I ought to do from here?

@amritadey
Copy link

amritadey commented Jul 13, 2020 via email

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment