Skip to content

Instantly share code, notes, and snippets.

@techhjork
Last active August 16, 2022 20:41
Show Gist options
  • Save techhjork/9728154ff301eaedcd5798196bb2a59c to your computer and use it in GitHub Desktop.
Save techhjork/9728154ff301eaedcd5798196bb2a59c to your computer and use it in GitHub Desktop.
Web-scraping Node.js structure
const puppeteer = require("puppeteer");
const express = require("express");
// Express application instance; the route handlers and the listen() call
// further down in this file are registered on it.
const app = express();
/**
 * Launches a browser through puppeteer and opens one new page (tab) in it.
 *
 * Ownership of the browser passes to the caller, who must call
 * `browser.close()` when done. If the page cannot be opened, the browser is
 * closed here before the error is rethrown, so no browser is left running.
 *
 * @returns {Promise<{browser: object, page: object}>} The launched browser and
 *   the page opened in it.
 * @throws Whatever `puppeteer.launch` or `browser.newPage` rejects with.
 */
async function initPuppeteer() {
  const browser = await puppeteer.launch({
    // Chromium flags that turn the sandbox off.
    // NOTE(review): presumably needed for a root/container deployment - confirm.
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();
    return { browser, page };
  } catch (err) {
    // The caller never receives the browser on this path, so it is our job
    // to shut it down.
    await browser.close();
    throw err;
  }
}
/**
 * Navigates `page` to `url` and logs how long the navigation took using
 * console.time / console.timeEnd with the label `Go to <url>`.
 *
 * @param {object} page - Object exposing `goto(url, options)` (a puppeteer page).
 * @param {string} url - Target address. When it does not begin with an
 *   `http://` or `https://` scheme (case-insensitive), `http://` is prepended.
 * @param {object} [options={ waitUntil: "load" }] - Passed through to `page.goto`.
 * @returns {Promise<void>}
 * @throws Whatever `page.goto` rejects with.
 */
async function goToURL(page, url, options = { waitUntil: "load" }) {
  // Test for a real scheme rather than the bare prefix "http": a host such as
  // "httpbin.org" still needs one, and "HTTPS://..." already has one.
  const hasScheme = /^https?:\/\//i.test(url);
  const target = hasScheme ? url : `http://${url}`;

  const label = `Go to ${url}`;
  console.time(label);
  try {
    await page.goto(target, options);
  } finally {
    // Always stop the timer so a failed navigation does not leave the label
    // registered for the next call with the same URL.
    console.timeEnd(label);
  }
}
/**
 * Scrapes the product cards (`.product-item-info`) from a listing page.
 *
 * A fresh browser is started for every call and is always closed again, even
 * when navigation or extraction fails.
 *
 * @param {string} [url] - Listing page to scrape. Falls back to the
 *   Eichholtz "new arrivals" page 1 when omitted or empty.
 * @returns {Promise<Array<{url: ?string, img: ?string, productSku: ?string, productSizes: ?string}>>}
 *   One entry per product card. A field is `null` when the card has no element
 *   matching that field's selector.
 * @throws Whatever browser start-up, navigation or `page.$$eval` rejects with.
 */
async function getProducts(url) {
  const targetUrl =
    url || "https://www.eichholtz.com/en/collection/new/new-arrivals.html?p=1";

  const { browser, page } = await initPuppeteer();
  try {
    await goToURL(page, targetUrl);

    // The callback is executed inside the page, so it must be self-contained
    // and may not reference anything from this Node scope.
    return await page.$$eval(".product-item-info", (productCards) =>
      productCards.map((card) => {
        // A card lacking one sub-element yields null for that field instead
        // of aborting the whole scrape with a TypeError.
        const read = (selector, prop) => {
          const element = card.querySelector(selector);
          return element ? element[prop] : null;
        };

        return {
          url: read(".product.photo.product-item-photo", "href"),
          img: read(".product-image-photo", "src"),
          productSku: read(".product-sku-value", "innerText"),
          // NOTE(review): the key says "sizes" but the value is the text of
          // the product link - confirm this is the intended field.
          productSizes: read(".product-item-link", "innerText"),
        };
      })
    );
  } finally {
    await browser.close();
  }
}
/**
 * Reads the three `.toolbar-number` elements inside `#toolbar-amount` on a
 * listing page and derives pagination figures from them.
 *
 * A fresh browser is started for every call and is always closed again, even
 * when navigation or extraction fails.
 *
 * @param {string} [url] - Listing page to inspect. Falls back to the
 *   Eichholtz "new arrivals" page 1 when omitted or empty.
 * @returns {Promise<{perPageProduct: number, totalProduct: number, productLastPage: number, totalPages: number}>}
 *   `perPageProduct` is the second toolbar number and `totalProduct` the
 *   third; `productLastPage` is `totalProduct % perPageProduct` and
 *   `totalPages` is `ceil(totalProduct / perPageProduct)`.
 * @throws Whatever browser start-up, navigation, `page.waitForSelector` or
 *   `page.evaluate` rejects with.
 */
async function getPagination(url) {
  const targetUrl =
    url || "https://www.eichholtz.com/en/collection/new/new-arrivals.html?p=1";

  const { browser, page } = await initPuppeteer();
  try {
    await goToURL(page, targetUrl);

    const toolbarNumbers = [];
    // The counter is block-scoped on purpose: the loop awaits, so a shared
    // (global) counter would be corrupted by concurrent requests.
    for (let position = 1; position <= 3; position += 1) {
      const handle = await page.waitForSelector(
        `#toolbar-amount .toolbar-number:nth-child(${position})`
      );
      const value = await page.evaluate(
        (element) => parseInt(element.textContent, 10),
        handle
      );
      toolbarNumbers.push(value);
    }

    // The first number is read (and waited for) but not used.
    // NOTE(review): presumably the toolbar reads "first-perPage of total" -
    // confirm against the page markup.
    const [, perPageProduct, totalProduct] = toolbarNumbers;

    return {
      perPageProduct,
      totalProduct,
      // NOTE(review): this is 0 when totalProduct is an exact multiple of
      // perPageProduct - confirm that is what consumers expect.
      productLastPage: totalProduct % perPageProduct,
      totalPages: Math.ceil(totalProduct / perPageProduct),
    };
  } finally {
    await browser.close();
  }
}
// GET / - responds with the scraped product list as JSON.
app.get("/", async (req, res, next) => {
  try {
    const data = await getProducts(
      "https://www.eichholtz.com/en/collection/new/new-arrivals.html?p=1"
    );
    res.json(data);
  } catch (err) {
    // Hand the failure to Express's error handling. Express 4 does not
    // observe a promise rejected by an async handler, so without this the
    // request would never receive a response.
    next(err);
  }
});
// GET /pagination - responds with the pagination figures as JSON.
app.get("/pagination", async (req, res, next) => {
  try {
    const pagination = await getPagination(
      "https://www.eichholtz.com/en/collection/new/new-arrivals.html?p=1"
    );
    res.json(pagination);
  } catch (err) {
    // Hand the failure to Express's error handling. Express 4 does not
    // observe a promise rejected by an async handler, so without this the
    // request would never receive a response.
    next(err);
  }
});
// Start the HTTP server on port 3000.
app.listen(3000, (err) => {
  if (err) {
    // Express 5 reports start-up failures (e.g. the port already being in
    // use) through this callback. Rethrow so the process does not announce
    // a server that never started.
    // NOTE(review): Express 4 never passes an argument here - confirm which
    // major version is installed.
    throw err;
  }
  console.log("localhost:3000");
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment