Skip to content

Instantly share code, notes, and snippets.

@jeffscottward
Created August 5, 2019 15:58
Show Gist options
  • Save jeffscottward/9a67f070b9be7a2da430cd88fd2962f2 to your computer and use it in GitHub Desktop.
Save jeffscottward/9a67f070b9be7a2da430cd88fd2962f2 to your computer and use it in GitHub Desktop.
Get Clean DOM for Web Scraping
const cheerio = require('cheerio')
const cleaner = require('clean-html')
const axios = require('axios')
module.exports = async function getCleanDOM (url) {
// Wrap in a recusive retry func
async function retry() {
try {
// AJAX to get HTML
this.siteRequest = await axios.get(url)
this.siteURLHTML = this.siteRequest.data
// Clean it for sanity
cleaner.clean(this.siteURLHTML, html => {
this.siteURLHTMLClean = html
})
// Send back page DOM
this.siteURLDOM = cheerio.load(this.siteURLHTMLClean)
return this.siteURLDOM
} catch (err) {
console.log(`ERROR - TRYING ${url} AGAIN`)
if(!err.response.status === 404) {
retry()
} else {
return '404'
}
}
}
return retry()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment