Last active
September 27, 2019 12:53
-
-
Save h-jennings/964a3a40b219497a5d3c0216b2e985de to your computer and use it in GitHub Desktop.
Scraping all product headlines from a site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const jsdom = require('jsdom'); | |
const { | |
JSDOM | |
} = jsdom; | |
const Nightmare = require('nightmare'); | |
const nightmare = Nightmare();
const url = 'http://books.toscrape.com/';

// Load the landing page, extract the category links from its markup, then
// start one scrape per category. Any failure along the way is logged once.
nightmare
  .goto(url)
  .wait('html')
  .evaluate(() => document.querySelector('html').innerHTML)
  .end()
  .then((homepageHtml) => getLinkContainer(homepageHtml))
  .then((categoryUrls) => {
    // Each call is fire-and-forget: nothing here waits for a scrape to finish.
    for (const categoryUrl of categoryUrls) {
      getHTMLForEachUrl(categoryUrl);
    }
  })
  .catch((error) => console.error(error));
/**
 * Extracts the category links from the homepage sidebar and returns them as
 * absolute URLs.
 *
 * @param {string} html - Markup of the homepage.
 * @returns {Promise<string[]>} Resolves with one absolute URL per link found.
 *   Rejects with an Error when the link container is missing or holds no
 *   links, so the caller always has a reason to log.
 */
const getLinkContainer = (html) => {
  return new Promise((resolve, reject) => {
    const baseUrl = 'http://books.toscrape.com/';
    const doc = new JSDOM(html).window.document;
    const linkContainer = doc.querySelector('.nav > li:nth-child(1) > ul:nth-child(2)');

    if (linkContainer === null) {
      reject(new Error('Category link container not found in the page markup'));
      return;
    }

    // Resolve each raw href attribute against the site root instead of
    // concatenating strings: relative paths become absolute and hrefs that
    // are already absolute are left intact rather than double-prefixed.
    const absoluteHrefArr = Array.from(
      linkContainer.querySelectorAll('a[href]'),
      (linkNode) => new URL(linkNode.getAttribute('href'), baseUrl).href
    );

    // A single link is a valid result; only an empty list is an error.
    if (absoluteHrefArr.length === 0) {
      reject(new Error('No category links found inside the link container'));
      return;
    }

    resolve(absoluteHrefArr);
  });
};
// This is where the heavy work happens: every call drives its own Nightmare
// instance. To keep callers from launching them all at once, each scrape is
// chained onto this promise so that only one runs at a time.
let scrapeQueue = Promise.resolve();

/**
 * Queues a scrape of `url`: loads the page in a fresh Nightmare instance,
 * captures its markup and hands it to `grabProductNames`.
 *
 * Scrapes run strictly one after another in call order, regardless of how
 * quickly the caller invokes this function.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Settles once this URL has been processed. It never
 *   rejects: failures are logged so one bad page cannot stall the queue.
 */
const getHTMLForEachUrl = (url) => {
  scrapeQueue = scrapeQueue
    .then(() =>
      new Nightmare()
        .goto(url)
        .wait('html')
        .evaluate(() => document.querySelector('html').innerHTML)
        .end()
        .then((html) => grabProductNames(html))
    )
    .catch((err) => console.error(err));

  return scrapeQueue;
};
/**
 * Parses a page's markup and logs every product heading found on it.
 *
 * Each logged entry pairs the heading's text content with the document's
 * title, so the output shows which page a heading came from.
 *
 * @param {string} html - Markup of the page to inspect.
 * @returns {void} The result is only written to the console.
 */
const grabProductNames = (html) => {
  const page = new JSDOM(html).window.document;
  const pageTitle = page.title;
  const headings = Array.from(page.querySelectorAll('.product_pod > h3'));

  const entries = headings.map((heading) => ({
    product: heading.textContent,
    page: pageTitle,
  }));

  console.log(entries);
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment