Skip to content

Instantly share code, notes, and snippets.

@tomhodgins
Last active July 1, 2020 03:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save tomhodgins/e6c782505d5cfcd29e6ee093441ad106 to your computer and use it in GitHub Desktop.
Save tomhodgins/e6c782505d5cfcd29e6ee093441ad106 to your computer and use it in GitHub Desktop.
const scraper = require('./scrape.js')
const models = require('./models.js')
/**
 * Scrape every URL of every model, print the combined results as a single
 * JSON string on stdout, and hand the same results back to the caller.
 *
 * @param {Array<Object>} models - Model definitions; each one must carry a
 *   `urls` array, which is forwarded to `processLinks` together with the model.
 * @returns {Promise<Array<Array<Object>>>} One inner array per model (in input
 *   order) holding one scrape result per URL. Rejects as soon as any single
 *   scrape rejects, because `Promise.all` is fail-fast.
 */
const processModels = async models => {
  const results = await Promise.all(
    models.map(model => processLinks(model, model.urls))
  )

  console.log(JSON.stringify(results))

  // Return the data as well as printing it, so the function is usable
  // programmatically and not only as a command-line side effect.
  return results
}
/**
 * Start one scrape per URL for the given model and wait for all of them.
 *
 * @param {Object} model - Model definition handed unchanged to the scraper.
 * @param {Array<string>} urls - Page addresses to scrape with that model.
 * @returns {Promise<Array>} Scrape results in the same order as `urls`.
 */
const processLinks = (model, urls) => {
  const scrapeOne = url => scraper(model, url)
  const pending = urls.map(scrapeOne)

  return Promise.all(pending)
}
// Entry point: scrape every configured model. The promise is given an explicit
// rejection handler so a failed scrape is reported on stderr and reflected in
// the exit code, instead of surfacing as an unhandled rejection.
processModels(models).catch(error => {
  console.error(error)
  process.exitCode = 1
})
// Beer Advocate
// `domain` and `urls` describe where to scrape; every other key (here `abv`)
// names an output field and lists the selectors used to extract its values.
const beerAdvocate = {
  domain: 'beeradvocate.com',
  abv: [
    {
      type: 'xpath',
      selector: '//*[@id="info_box"]//b[contains(text(), "ABV")]/following-sibling::text()[1]'
    }
  ],
  urls: ['https://www.beeradvocate.com/beer/profile/35/101/']
}

// The list of site models to scrape, one entry per site.
module.exports = [beerAdvocate]
[
[
{
"url": "https://www.beeradvocate.com/beer/profile/35/101/",
"abv": ["5.60%"]
}
]
]
const puppeteer = require('puppeteer')
module.exports = function(
model = {},
url = ''
) {
return (async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto(url, {waitUntil: 'networkidle2'})
const result = await page.evaluate(
(model, url) => {
const getValues = selectors => selectors.reduce(
(acc, selector) => [
...acc,
...Array.from(
selector.type === 'css'
? document.querySelectorAll(selector.selector)
: queryXPathAll(selector.selector)
).map(
tag => tag.textContent.replace(/\s(\s+)/g, '').trim()
).filter(
text => text
)
],
[]
)
const queryXPathAll = path => {
const nodes = []
const xpath = document.evaluate(
path,
document,
null,
XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
null
)
for (let i = 0; i < xpath.snapshotLength; i++) {
nodes.push(xpath.snapshotItem(i))
}
return nodes
}
const output = {}
output.url = url
for (let data in model) {
if (data !== 'domain' && data !== 'urls') {
output[data] = getValues(model[data])
}
}
return output
},
model,
url
)
await browser.close()
return result
})()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment