Skip to content

Instantly share code, notes, and snippets.

@MartinMuzatko
Last active September 4, 2019 11:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinMuzatko/babef005cfb9e493536ac9bf030ee0be to your computer and use it in GitHub Desktop.
Save MartinMuzatko/babef005cfb9e493536ac9bf030ee0be to your computer and use it in GitHub Desktop.
// Usage: open the website in a browser tab and run this script in its console.
// Abort early with `clearTimeout(interval)`; once it is done, harvest the
// collected records with `console.log(scrapes)`.
$i('axios') // presumably the console-importer helper that makes `axios` available — confirm
const base = 'http://www.echojs.com/latest/' // URL prefix; scrape() appends the page offset to it
const timeout = 500 // delay in milliseconds that scrapeAll() passes to setTimeout between pages
/**
 * Fetch one listing page and extract a record for every article on it.
 *
 * The response HTML is parsed into a detached document with DOMParser, so the
 * page this script runs on is left intact instead of having its body replaced
 * on every request.
 *
 * @param {number} [id=0] Offset appended to `base` to build the request URL.
 * @returns {Promise} Resolves to an array with one object per
 *   `#newslist article`, shaped `{ headline, user, upvotes, downvotes, href }`.
 *   The text fields hold the matched element's `textContent`; a field is
 *   `null` when the article has no matching element.
 */
async function scrape(id = 0) {
    const { data } = await axios.get(base + id)
    const doc = new DOMParser().parseFromString(data, 'text/html')

    // textContent of the first match inside `root`, or null when nothing matches,
    // so one incomplete article cannot abort the whole page with a TypeError.
    const textOf = (root, selector) => {
        const el = root.querySelector(selector)
        return el ? el.textContent : null
    }

    return [...doc.querySelectorAll('#newslist article')].map(article => {
        const link = article.querySelector('h2 a')
        return {
            headline: textOf(article, 'h2'),
            user: textOf(article, 'username'),
            upvotes: textOf(article, '.upvotes'),
            downvotes: textOf(article, '.downvotes'),
            href: link ? link.href : null,
        }
    })
}
// Crawl state, kept at top level so it can be inspected from the console.
let scrapes = [] // every article record collected so far
let id = 0 // offset handed to the next scrape() call
let interval = null // handle of the pending setTimeout, for clearTimeout(interval)
let stopped = false // set by stopScraping(); checked before scheduling the next page

/**
 * Abort the crawl.
 *
 * `clearTimeout(interval)` alone only cancels a run that is already scheduled;
 * while a request is in flight no timer is pending yet and the next one would
 * still be armed afterwards. Setting the flag as well covers that window.
 */
function stopScraping() {
    stopped = true
    clearTimeout(interval)
}

/**
 * Scrape the page at the current offset, append its records to `scrapes`,
 * advance the offset by 30 and schedule the next run after `timeout` ms.
 *
 * The crawl ends on its own when a page yields no articles (past the last
 * page) or when the request fails; the failure is logged instead of surfacing
 * as an unhandled rejection. Records gathered up to that point stay in `scrapes`.
 */
async function scrapeAll() {
    let page
    try {
        page = await scrape(id)
    } catch (err) {
        console.error(`scrape failed at offset ${id}, stopping`, err)
        return
    }
    if (page.length === 0) {
        // Nothing left to collect: stop instead of requesting empty pages forever.
        console.log(`done: ${scrapes.length} articles`)
        return
    }
    scrapes = scrapes.concat(page)
    id += 30
    console.log(scrapes.length)
    if (!stopped) {
        interval = setTimeout(scrapeAll, timeout)
    }
}
scrapeAll()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment