Skip to content

Instantly share code, notes, and snippets.

@kinoute
Last active September 16, 2019 20:51
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kinoute/0dcc2ca5e9427d2ff1432bd7d388e095 to your computer and use it in GitHub Desktop.
Save kinoute/0dcc2ca5e9427d2ff1432bd7d388e095 to your computer and use it in GitHub Desktop.
Scrape reviews on Amazon.fr (only) with Python, Pandas & BeautifulSoup.
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import random
from urllib.parse import urlsplit
# The review count is divided by this value to get the number of listing
# pages, i.e. the script assumes 10 reviews per page.
REVIEWS_PER_PAGE = 10
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0'
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever
COLUMNS = ["user", "rating", "date", "review"]


def parse_review_count(link_text):
    """Return the integer formed by every digit found in *link_text*.

    Non-digit characters (words, spaces or other thousands separators) are
    ignored, so ``"1 234 commentaires"`` yields ``1234``.

    Raises:
        ValueError: if *link_text* contains no digit at all.
    """
    digits = ''.join(ch for ch in link_text if ch.isdigit())
    if not digits:
        raise ValueError("No review count found in {!r}".format(link_text))
    return int(digits)


def count_review_pages(total_reviews, per_page=REVIEWS_PER_PAGE):
    """Return how many listing pages are needed to show *total_reviews*.

    The division is rounded up so that a partially filled last page is still
    counted (25 reviews -> 3 pages); zero or negative counts give 0 pages.
    """
    if total_reviews <= 0:
        return 0
    return (total_reviews + per_page - 1) // per_page


def fetch_soup(url, headers):
    """Download *url* and return its content parsed with ``html.parser``.

    Raises:
        requests.RequestException: on timeout, connection failure or an HTTP
            error status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def parse_review(review):
    """Extract ``[user, rating, date, body]`` from one review container.

    Returns ``None`` when any of the expected elements is missing, so that
    matching containers which are not actual reviews can be skipped instead
    of crashing the whole run.
    """
    user = review.find('span', {'class': 'a-profile-name'})
    rating = review.find('a', {'class': 'a-link-normal'})
    review_date = review.find('span', {'class': 'review-date'})
    data = review.find('div', {'class': 'a-row a-spacing-small review-data'})
    body = data.find('span') if data is not None else None
    if any(node is None for node in (user, rating, review_date, body)):
        return None
    # get_text() (unlike .string) also works when a tag has several children
    # and returns plain str objects that do not keep the parse tree alive.
    return [
        user.get_text(),
        rating.get_text().split(' ')[0],  # keep only the first token of the link text
        review_date.get_text(),
        body.get_text("\n", strip=True),
    ]


def main():
    """Ask for a product URL, scrape its reviews and save them to results.csv."""
    headers = requests.utils.default_headers()
    headers.update({'User-Agent': USER_AGENT})

    urlProduct = input("Enter the Product URL on Amazon:\n").strip()
    base_url = "{0.scheme}://{0.netloc}".format(urlsplit(urlProduct))

    soup = fetch_soup(urlProduct, headers)
    # The same link provides both the reviews URL (href) and the review
    # count (its text), so it is looked up only once.
    reviews_link = soup.find('a', {'class': 'a-link-emphasis a-text-bold'})
    if reviews_link is None or not reviews_link.get('href'):
        raise SystemExit("Could not find the link to the reviews on this page.")
    reviewsURL = base_url + reviews_link['href'] + "&pageNumber="
    numbPage = count_review_pages(parse_review_count(reviews_link.get_text()))

    rows = []
    print("Starting...")
    # Pages are numbered from 1 and the last page must be included.
    for page_number in range(1, numbPage + 1):
        soup = fetch_soup(reviewsURL + str(page_number), headers)
        for review in soup.find_all('div', {'class': 'a-section celwidget'}):
            row = parse_review(review)
            if row is not None:
                rows.append(row)
        # Random pause between page requests; pointless after the last one.
        if page_number < numbPage:
            time.sleep(random.randrange(8, 20))

    # Build the frame once instead of growing it row by row.
    csv_reviews = pd.DataFrame(rows, columns=COLUMNS)
    print(csv_reviews)
    csv_reviews.to_csv('results.csv')


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment