Skip to content

Instantly share code, notes, and snippets.

@humamfauzi
Created October 30, 2018 10:44
Show Gist options
  • Save humamfauzi/f8c1a9e26c6e0d80a24edc28e7ad14ab to your computer and use it in GitHub Desktop.
Save humamfauzi/f8c1a9e26c6e0d80a24edc28e7ad14ab to your computer and use it in GitHub Desktop.
Scraping Basics with Puppeteer
const puppeteer = require('puppeteer');
/**
 * Opens `link` in a new page of `browser`, extracts data from it and closes
 * the page again.
 *
 * @param {object} browser - Puppeteer browser used to open the page.
 * @param {string} link - URL to navigate to.
 * @param {boolean} returnLink - When truthy, the result also carries the
 *   `links` produced by `ScrapperSchema.listingLink`.
 * @returns {Promise<{page: string, price: *, links?: *}|undefined>} The
 *   scraped data, or `undefined` when anything failed (the error is logged,
 *   not rethrown, so one bad page does not abort a batch of scrapes).
 */
const scrape = async (browser, link, returnLink) => {
  let page;
  try {
    page = await browser.newPage();
    // Navigation timeout is passed to Puppeteer in milliseconds (5 minutes).
    await page.goto(link, {timeout: 300000});
    const scraped = {
      "page": link,
      "price": await page.evaluate(ScrapperSchema.frontPage),
    };
    if (returnLink) {
      scraped.links = await page.evaluate(ScrapperSchema.listingLink);
    }
    return scraped;
  } catch (e) {
    console.error(e);
  } finally {
    // Cleanup runs on success AND failure so a failed scrape never leaks an
    // open page. `page` is still undefined if `newPage()` itself failed.
    if (page) {
      try {
        try {
          await page.goto("about:blank");
        } finally {
          // Awaited so callers can safely close the browser right after.
          await page.close();
        }
      } catch (cleanupError) {
        console.error(cleanupError);
      }
    }
  }
};
/**
 * Extraction functions that are handed to Puppeteer's `page.evaluate`.
 *
 * They read the global `document`, so they only work inside a page. Keep each
 * function fully self-contained (no references to variables or helpers
 * declared outside its own body): Puppeteer runs them in the page, not in
 * this Node module.
 */
const ScrapperSchema = {
  /**
   * Collects every `h3 > a` title together with the `.price_color` text at
   * the same position in the document.
   *
   * @returns {Array<{title: string, price: (string|null)}>|undefined} One
   *   entry per title; `price` is `null` when there is no price element at
   *   that position. `undefined` if extraction failed unexpectedly.
   */
  frontPage: () => {
    try {
      const titles = Array.from(document.querySelectorAll("h3 > a"));
      const prices = document.querySelectorAll(".price_color");
      // Titles and prices are paired by position. A missing price must not
      // throw, otherwise the whole page's data would be discarded.
      return titles.map((anchor, position) => ({
        title: anchor.title,
        price: prices[position] ? prices[position].innerText : null,
      }));
    } catch (e) {
      console.error(e);
    }
  },

  /**
   * Collects the `href` of every anchor that points at a book category
   * listing.
   *
   * @returns {string[]|undefined} Matching URLs in document order, or
   *   `undefined` if extraction failed unexpectedly.
   */
  listingLink: () => {
    try {
      const categoryPrefix = "http://books.toscrape.com/catalogue/category/books/";
      return Array.from(document.querySelectorAll("a"))
        .map((anchor) => anchor.href)
        // Skip anchors whose `href` is not a plain string instead of
        // letting one of them abort the whole collection.
        .filter((href) => typeof href === "string" && href.includes(categoryPrefix));
    } catch (e) {
      console.error(e);
    }
  },
};
// +------------------------------------+
// MAIN
// +------------------------------------+
// Entry point: scrape the front page for category links, scrape every
// category page concurrently, then print the collected data as JSON.
(async () => {
  const browser = await puppeteer.launch({headless: true});
  try {
    const frontPage = await scrape(browser, "http://books.toscrape.com/", true);
    // `scrape` logs and returns undefined on failure; without the category
    // links there is nothing left to crawl.
    if (!frontPage || !Array.isArray(frontPage.links)) {
      console.error("Could not collect category links from the front page; aborting.");
      process.exitCode = 1;
      return;
    }
    const collection = await Promise.all(frontPage.links.map(elem => {
      console.log("Collecting from", elem);
      return scrape(browser, elem, false);
    }));
    console.log(JSON.stringify(collection, null, 4));
  } finally {
    // Always shut the browser down, even when scraping throws, so the
    // browser process is not left running.
    await browser.close();
  }
})().catch((e) => {
  // Last-resort handler so a launch/close failure is reported instead of
  // surfacing as an unhandled promise rejection.
  console.error(e);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment