Skip to content

Instantly share code, notes, and snippets.

@h-jennings
Last active September 27, 2019 12:53
Show Gist options
  • Save h-jennings/964a3a40b219497a5d3c0216b2e985de to your computer and use it in GitHub Desktop.
Save h-jennings/964a3a40b219497a5d3c0216b2e985de to your computer and use it in GitHub Desktop.
Scraping all product headlines from a site
const jsdom = require('jsdom');
const {
JSDOM
} = jsdom;
const Nightmare = require('nightmare');
const nightmare = Nightmare();
const url = 'http://books.toscrape.com/';

// Entry point: load the homepage, pull the link list out of its HTML, then
// start one page load per link. Any failure along the way is logged.
(async () => {
  try {
    const homepageHtml = await nightmare
      .goto(url)
      .wait('html')
      .evaluate(() => document.querySelector('html').innerHTML)
      .end();
    const pageUrls = await getLinkContainer(homepageHtml);
    // TODO: every page load is started in the same tick; nothing here limits
    // how many run at once.
    for (const pageUrl of pageUrls) {
      getHTMLForEachUrl(pageUrl);
    }
  } catch (err) {
    console.error(err);
  }
})();
/**
 * Collects the links inside the nested list matched by
 * `.nav > li:nth-child(1) > ul:nth-child(2)` and returns them as absolute URLs.
 *
 * @param {string} html - Markup to parse with JSDOM.
 * @param {string} [baseUrl='http://books.toscrape.com/'] - Base against which
 *   each link's `href` attribute is resolved.
 * @returns {Promise<string[]>} Resolves with one absolute URL per link found.
 *   Rejects with an Error when the list is missing or contains no links.
 */
const getLinkContainer = async (html, baseUrl = 'http://books.toscrape.com/') => {
  const { document } = new JSDOM(html).window;
  const linkContainer = document.querySelector('.nav > li:nth-child(1) > ul:nth-child(2)');
  if (linkContainer === null) {
    throw new Error('Link container not found in the supplied HTML');
  }

  // Resolve the raw attribute against the base instead of concatenating
  // strings, so absolute and dot-relative hrefs are handled correctly too.
  const absoluteHrefs = Array.from(
    linkContainer.querySelectorAll('a[href]'),
    (linkNode) => new URL(linkNode.getAttribute('href'), baseUrl).href,
  );

  // A single link is a valid result; only an empty list is an error.
  if (absoluteHrefs.length === 0) {
    throw new Error('No links found inside the link container');
  }
  return absoluteHrefs;
};
// ! This is where a huge amount of data gets processed.
// ! Need to figure out how to throttle these function calls.
// Debounce, perhaps?
/**
 * Loads one page in its own Nightmare instance and passes the page's HTML to
 * grabProductNames.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<*>} Resolves with whatever grabProductNames returns, or
 *   with undefined after a failure has been logged via console.error. The
 *   chain ends in a catch, so the returned promise does not reject on a
 *   load or parse failure.
 */
const getHTMLForEachUrl = (url) => {
  const nightmareInstance = new Nightmare();
  // Return the chain instead of leaving it floating, so a caller can wait for
  // this page to finish (for example to process pages one at a time).
  return nightmareInstance
    .goto(url)
    .wait('html')
    .evaluate(() => document.querySelector('html').innerHTML)
    .end()
    .then((html) => grabProductNames(html))
    .catch((err) => console.error(err));
};
/**
 * Parses a page's HTML and logs every product heading found on it.
 *
 * Each logged entry pairs the text of a `.product_pod > h3` element with the
 * title of the document it came from.
 *
 * @param {string} html - Markup to parse with JSDOM.
 * @returns {void} The collected entries are only written to the console.
 */
const grabProductNames = (html) => {
  const { document: doc } = new JSDOM(html).window;
  // The `page` key holds the document title, not its address.
  const pageTitle = doc.title;
  const headings = doc.querySelectorAll('.product_pod > h3');
  const entries = Array.from(headings, (heading) => ({
    product: heading.textContent,
    page: pageTitle,
  }));
  console.log(entries);
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment