Skip to content

Instantly share code, notes, and snippets.

@triposat
Created October 27, 2023 05:05
Show Gist options
  • Save triposat/20706d61989a4031669c2e3d25f487d0 to your computer and use it in GitHub Desktop.
Save triposat/20706d61989a4031669c2e3d25f487d0 to your computer and use it in GitHub Desktop.
How to Scrape Amazon Product Reviews Behind a Login
// Import the necessary libraries.
const puppeteer = require("puppeteer");
const fs = require("fs");
// CSS selectors used by the scraper, grouped by the page they apply to.
// Key names (including the misspelled `singin`) are kept as-is because the
// scraping function looks them up by these exact names.
const selectors = {
  // Product page: one match per review, then fields looked up inside each review.
  allReviews: '#cm-cr-dp-review-list div.review',
  authorName: 'div[data-hook="genome-widget"] span.a-profile-name',
  reviewTitle: '[data-hook=review-title]>span:not([class])',
  reviewDate: 'span[data-hook=review-date]',

  // Sign-in flow: e-mail step followed by password step.
  emailid: 'input[name=email]',
  password: 'input[name=password]',
  continue: 'input[id=continue]',
  singin: 'input[id=signInSubmit]',
};
/**
 * Format a value as a single CSV field.
 *
 * The value is wrapped in double quotes when it contains a comma, a double
 * quote, or a line break (or when `forceQuote` is true); embedded double
 * quotes are doubled so spreadsheet tools parse the field back unchanged.
 *
 * @param {*} value - Value to write; null/undefined become an empty field.
 * @param {boolean} [forceQuote=false] - Quote the field even if not required.
 * @returns {string} The escaped field text.
 */
function toCsvField(value, forceQuote = false) {
  const text = value == null ? "" : String(value);
  if (!forceQuote && !/[",\r\n]/.test(text)) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Read the text content of the first descendant of `parent` matching `selector`.
 *
 * @param {object} parent - Puppeteer element handle to search inside.
 * @param {string} selector - CSS selector of the wanted descendant.
 * @returns {Promise<string>} The element's text, or "" when nothing matches.
 */
async function readTextContent(parent, selector) {
  const handle = await parent.$(selector);
  if (handle === null) {
    // A missing field should not abort the whole scrape.
    return "";
  }
  return handle.evaluate((element) => element.textContent);
}

/**
 * Sign in to Amazon, open the product page, and write the author, title and
 * date of every review found on it to `amazon_reviews.csv`.
 *
 * Credentials are read from the AMAZON_EMAIL / AMAZON_PASSWORD environment
 * variables, falling back to the placeholder values when they are unset.
 *
 * @returns {Promise<void>} Resolves once the CSV is written and the browser closed.
 * @throws Rejects with the underlying error if any browser step or the file
 *   write fails; the browser is closed before the error propagates.
 */
async function fetchAmazonReviews() {
  const email = process.env.AMAZON_EMAIL || "satyam@gmail.com";
  const password = process.env.AMAZON_PASSWORD || "mypassword";

  // headless: false keeps the browser window visible while the script runs.
  const browser = await puppeteer.launch({
    headless: false,
  });

  try {
    const page = await browser.newPage();

    // Sign-in flow: e-mail step, then password step.
    await page.goto("https://www.amazon.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.com%2FENHANCE-Headphone-Customizable%2Fdp%2FB07DR59JLP%2F%3Fref_%3Dnav_custrec_signin&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=usflex&openid.mode=checkid_setup&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0");
    await page.waitForSelector(selectors.emailid);
    await page.type(selectors.emailid, email, { delay: 100 });
    await page.click(selectors.continue);
    await page.waitForSelector(selectors.password);
    await page.type(selectors.password, password, { delay: 100 });

    // Start waiting for the navigation before clicking; otherwise a fast
    // navigation can complete before the waiter is registered and the wait
    // would time out.
    await Promise.all([
      page.waitForNavigation(),
      page.click(selectors.singin),
    ]);

    // Open the product page and wait for its review list.
    await page.goto("https://www.amazon.com/ENHANCE-Headphone-Customizable-Lighting-Flexible/dp/B07DR59JLP");
    await page.waitForSelector(selectors.allReviews);
    const reviewElements = await page.$$(selectors.allReviews);

    // Matches dates such as "October 5, 2023" inside the raw date line.
    const datePattern = /(\w+\s\d{1,2},\s\d{4})/;
    const reviewsData = [];
    for (const reviewElement of reviewElements) {
      const author = await readTextContent(reviewElement, selectors.authorName);
      const title = await readTextContent(reviewElement, selectors.reviewTitle);
      const rawReviewDate = await readTextContent(reviewElement, selectors.reviewDate);
      const match = rawReviewDate.match(datePattern);
      // Drop the comma so the date is "October 5 2023".
      const reviewDate = match ? match[0].replace(',', '') : "Date not found";
      reviewsData.push({ author, title, reviewDate });
    }

    // Build the CSV; the title is always quoted (as before), other fields
    // only when they contain characters that would break the row.
    let csvContent = "Author,Title,Date\n";
    for (const { author, title, reviewDate } of reviewsData) {
      csvContent += `${toCsvField(author)},${toCsvField(title, true)},${toCsvField(reviewDate)}\n`;
    }

    const csvFileName = "amazon_reviews.csv";
    fs.writeFileSync(csvFileName, csvContent, "utf8");
    console.log('CSV file created!');
  } finally {
    // Always release the browser process, even when a step above failed.
    await browser.close();
  }
}
// Run the scraper. Report a failure and exit non-zero instead of leaving the
// returned promise's rejection unhandled.
fetchAmazonReviews().catch((error) => {
  console.error("Failed to fetch Amazon reviews:", error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment