Created
August 5, 2019 15:58
-
-
Save jeffscottward/9a67f070b9be7a2da430cd88fd2962f2 to your computer and use it in GitHub Desktop.
Get Clean DOM for Web Scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const cheerio = require('cheerio')
const cleaner = require('clean-html')
const axios = require('axios')
module.exports = async function getCleanDOM (url) { | |
// Wrap in a recusive retry func | |
async function retry() { | |
try { | |
// AJAX to get HTML | |
this.siteRequest = await axios.get(url) | |
this.siteURLHTML = this.siteRequest.data | |
// Clean it for sanity | |
cleaner.clean(this.siteURLHTML, html => { | |
this.siteURLHTMLClean = html | |
}) | |
// Send back page DOM | |
this.siteURLDOM = cheerio.load(this.siteURLHTMLClean) | |
return this.siteURLDOM | |
} catch (err) { | |
console.log(`ERROR - TRYING ${url} AGAIN`) | |
if(!err.response.status === 404) { | |
retry() | |
} else { | |
return '404' | |
} | |
} | |
} | |
return retry() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment