Skip to content

Instantly share code, notes, and snippets.

@buddyeorl
Last active October 9, 2020 00:53
Show Gist options
  • Save buddyeorl/4e7d8f80734739d524980f1e489e8b94 to your computer and use it in GitHub Desktop.
Save buddyeorl/4e7d8f80734739d524980f1e489e8b94 to your computer and use it in GitHub Desktop.
const fs = require('fs');
const util = require('util');
const axios = require('axios');
const cheerio = require('cheerio');
// Address of the page to crawl. It is an empty string in this file;
// presumably a real URL is filled in before the script is run.
const url = ""; // <- i set the url to crawl here
/**
 * Fetch `url` with axios after a randomized politeness delay.
 *
 * The wait before the request is `minDelayMs` plus a random amount in
 * `[0, maxJitterMs)`; the defaults reproduce the original 5.25s-25.25s window.
 *
 * @param {string} url - Address to request.
 * @param {object} [options]
 * @param {number} [options.minDelayMs=5250] - Fixed part of the delay.
 * @param {number} [options.maxJitterMs=20000] - Upper bound of the random part.
 * @returns {Promise<object>} The axios response object.
 * @throws {Error} When the request fails; the underlying error is on `.cause`.
 */
const fetchData = async (url, { minDelayMs = 5250, maxJitterMs = 20000 } = {}) => {
  // Randomize crawling times so requests are not evenly spaced.
  const delayMs = minDelayMs + Math.floor(Math.random() * maxJitterMs);
  await new Promise((resolve) => {
    setTimeout(resolve, delayMs);
  });

  let response;
  try {
    response = await axios(url);
  } catch (err) {
    // Previously the error was only logged, leaving `response` undefined and
    // crashing on `response.status` with an unrelated TypeError.
    const wrapped = new Error(`Request to ${url} failed: ${err.message}`);
    wrapped.cause = err;
    throw wrapped;
  }

  // With axios' default `validateStatus`, non-2xx responses reject above, so
  // this only fires for other 2xx codes. The response is still returned.
  if (response.status !== 200) {
    console.warn(`Unexpected status ${response.status} for ${url}`);
  }
  return response;
};
/**
 * Crawl `url` and build an index of the links found inside every element
 * matching `elementData`.
 *
 * Within each matched element the first `<a>` starts a new category: its
 * text is trimmed, every non-alphanumeric character becomes `-`, and the
 * result is lower-cased to form the key. Each following `<a>` with non-empty
 * text is stored under that category as `{ url: <href> }`, keyed by its text.
 *
 * The index is also written to `./categories.js` as `let obj = <literal>`,
 * where the literal is the JSON text rendered as a quoted string by
 * `util.inspect` (the same format the file has always had).
 *
 * @param {string} url - Page to fetch via `fetchData`.
 * @param {string} elementData - Selector handed to cheerio.
 * @returns {Promise<object>} The category index.
 */
const crawlFunc = async (url, elementData) => {
  const res = await fetchData(url);
  const $ = cheerio.load(res.data);
  const equipment = {};
  let curText = '';

  $(elementData).each((_, container) => {
    $(container).find('a').each((i, anchor) => {
      const text = $(anchor).text();
      if (i === 0) {
        // First link names the category for the links that follow it.
        curText = text.trim().replace(/[^A-Z0-9]/ig, '-').toLowerCase();
        equipment[curText] = {};
        return;
      }
      if (text !== '') {
        equipment[curText][text] = { url: anchor.attribs.href };
      }
    });
  });

  // util.inspect truncates strings past `maxStringLength` (default 10000),
  // which silently cut the file short for large indexes; disable the limit.
  const literal = util.inspect(JSON.stringify(equipment), { maxStringLength: Infinity });
  fs.writeFileSync('./categories.js', `let obj = ${literal}`, 'utf-8');
  return equipment;
};
// Create the site category index file.
// TODO(review): the original call passed no arguments at all. `url` is now
// forwarded, but the selector for `elementData` is not defined anywhere in
// this file and still has to be supplied as the second argument.
crawlFunc(url).catch((err) => {
  // Surface failures instead of leaving the promise floating.
  console.error('Failed to build category index:', err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment