Created
October 27, 2023 05:05
-
-
Save triposat/20706d61989a4031669c2e3d25f487d0 to your computer and use it in GitHub Desktop.
How to Scrape Amazon Product Reviews Behind a Login
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Import the necessary libraries.
const puppeteer = require("puppeteer");
const fs = require("fs");
// CSS selectors used by the scraper: the review container and its fields,
// plus the controls of the two-step sign-in form.
// Frozen so the shared lookup table cannot be modified by accident.
const selectors = Object.freeze({
  // Each review card inside the product page's review list.
  allReviews: '#cm-cr-dp-review-list div.review',
  // Reviewer display name, looked up inside a single review card.
  authorName: 'div[data-hook="genome-widget"] span.a-profile-name',
  // Review headline text (the class-less span inside the title element).
  reviewTitle: '[data-hook=review-title]>span:not([class])',
  // Raw date line of the review; the date itself is extracted from it later.
  reviewDate: 'span[data-hook=review-date]',
  // Sign-in form: e-mail field, password field, and the two submit buttons.
  emailid: 'input[name=email]',
  password: 'input[name=password]',
  continue: 'input[id=continue]',
  // NOTE: key is spelled "singin" (sic); kept as-is because callers read it by this name.
  singin: 'input[id=signInSubmit]',
});
/**
 * Pull the calendar date out of a raw review-date line.
 *
 * Looks for the first "<word> <1-2 digits>, <4 digits>" occurrence and
 * returns it with the comma removed (e.g. "October 5 2023").
 *
 * @param {string} rawReviewDate - Text content of the review-date element.
 * @returns {string} The extracted date, or "Date not found" when nothing matches.
 */
function formatReviewDate(rawReviewDate) {
  const match = /(\w+\s\d{1,2},\s\d{4})/.exec(rawReviewDate);
  return match ? match[0].replace(',', '') : "Date not found";
}

/**
 * Encode one value as a CSV field.
 *
 * Embedded double quotes are doubled, and the field is wrapped in double
 * quotes whenever it contains a quote, comma, or line break (or always,
 * when `alwaysQuote` is set).
 *
 * @param {*} value - Value to encode; converted with String().
 * @param {boolean} [alwaysQuote=false] - Quote the field even when not required.
 * @returns {string} The encoded field.
 */
function toCsvField(value, alwaysQuote = false) {
  const text = String(value);
  if (!alwaysQuote && !/[",\r\n]/.test(text)) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Build the CSV document (header row plus one row per review).
 *
 * @param {Array<{author: string, title: string, reviewDate: string}>} reviews
 * @returns {string} CSV text; every row, including the last, ends with "\n".
 */
function buildReviewsCsv(reviews) {
  const rows = reviews.map(
    ({ author, title, reviewDate }) =>
      // The title column is always quoted, matching the file's existing output format.
      `${toCsvField(author)},${toCsvField(title, true)},${toCsvField(reviewDate)}\n`
  );
  return `Author,Title,Date\n${rows.join("")}`;
}

/**
 * Read author, title and date from a single review element.
 *
 * @param {object} reviewElement - Element handle for one review card.
 * @returns {Promise<{author: string, title: string, reviewDate: string}>}
 */
async function extractReview(reviewElement) {
  const readText = (selector) =>
    reviewElement.$eval(selector, (element) => element.textContent);

  const author = await readText(selectors.authorName);
  const title = await readText(selectors.reviewTitle);
  const rawReviewDate = await readText(selectors.reviewDate);

  return { author, title, reviewDate: formatReviewDate(rawReviewDate) };
}

/**
 * Sign in to Amazon, open a product page, scrape the reviews shown on it and
 * write them to a CSV file.
 *
 * All options are optional; the defaults reproduce the original hard-coded
 * values. The default credentials are placeholders — pass real ones in.
 *
 * @param {object} [options]
 * @param {string} [options.email] - Account e-mail typed into the sign-in form.
 * @param {string} [options.password] - Account password typed into the sign-in form.
 * @param {string} [options.productUrl] - Product page whose reviews are scraped.
 * @param {string} [options.csvFileName] - Path of the CSV file to write.
 * @returns {Promise<void>} Resolves once the CSV is written and the browser is closed.
 * @throws Rejects with whatever error a browser step or the file write raises;
 *   the browser is closed before the rejection propagates.
 */
async function fetchAmazonReviews({
  email = "satyam@gmail.com",
  password = "mypassword",
  productUrl = "https://www.amazon.com/ENHANCE-Headphone-Customizable-Lighting-Flexible/dp/B07DR59JLP",
  csvFileName = "amazon_reviews.csv",
} = {}) {
  const signInUrl = "https://www.amazon.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.com%2FENHANCE-Headphone-Customizable%2Fdp%2FB07DR59JLP%2F%3Fref_%3Dnav_custrec_signin&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=usflex&openid.mode=checkid_setup&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0";

  // headless: false keeps the browser window visible while the script runs.
  const browser = await puppeteer.launch({
    headless: false,
  });

  try {
    const page = await browser.newPage();

    // Step 1 of sign-in: e-mail, then "continue".
    await page.goto(signInUrl);
    await page.waitForSelector(selectors.emailid);
    await page.type(selectors.emailid, email, { delay: 100 });
    await page.click(selectors.continue);

    // Step 2 of sign-in: password, then submit.
    await page.waitForSelector(selectors.password);
    await page.type(selectors.password, password, { delay: 100 });
    // Start waiting for the navigation BEFORE clicking; otherwise a fast
    // navigation can finish first and the wait would never resolve.
    await Promise.all([
      page.waitForNavigation(),
      page.click(selectors.singin),
    ]);

    // Open the product page and collect every review element on it.
    await page.goto(productUrl);
    await page.waitForSelector(selectors.allReviews);
    const reviewElements = await page.$$(selectors.allReviews);

    const reviewsData = [];
    for (const reviewElement of reviewElements) {
      reviewsData.push(await extractReview(reviewElement));
    }

    await fs.promises.writeFile(csvFileName, buildReviewsCsv(reviewsData), "utf8");
    console.log('CSV file created!');
  } finally {
    // Always release the browser, even when a step above failed.
    await browser.close();
  }
}
// Run the scraper. The returned promise is handled explicitly so that a
// failure is reported and reflected in the process exit code instead of
// surfacing as an unhandled rejection.
fetchAmazonReviews().catch((error) => {
  console.error("Failed to fetch Amazon reviews:", error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment