@pebueno
Created December 20, 2022 22:44
Simple crawler to gather the URLs of a website
const axios = require("axios");
const cheerio = require("cheerio");
const urlParser = require("url");
const fs = require("fs");

// Convert a comma-separated string into one entry per line.
function replaceCommaLine(data) {
  // Split on commas and trim whitespace from each entry.
  const dataToArray = data.split(",").map((item) => item.trim());
  // Re-join the entries with newlines instead of commas.
  return dataToArray.join("\n");
}
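// Example of the transformation this helper performs:
// replaceCommaLine("a, b ,c") -> "a\nb\nc"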

// Starting point for the crawl and a path fragment to skip.
const crawl = {
  url: "https://www.epocacosmeticos.com.br/marcas",
  ignore: "/search",
};

// URLs that have already been requested.
const seenUrls = {};

// Normalize a link: keep absolute URLs, prefix relative ones with protocol and host.
const getUrl = (link, host, protocol) => {
  if (link.includes("http")) {
    return link;
  } else if (link.startsWith("/")) {
    return `${protocol}//${host}${link}`;
  } else {
    return `${protocol}//${host}/${link}`;
  }
};
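// Rough illustration of getUrl with the crawl target above (the link values are
// assumptions for the example, not output captured from the site):
// getUrl("https://example.com/x", "www.epocacosmeticos.com.br", "https:") -> "https://example.com/x"
// getUrl("/marcas/a", "www.epocacosmeticos.com.br", "https:")             -> "https://www.epocacosmeticos.com.br/marcas/a"
// getUrl("marcas/a", "www.epocacosmeticos.com.br", "https:")              -> "https://www.epocacosmeticos.com.br/marcas/a"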

// Fetch the start page, collect same-host links, and write them to test.txt.
axios(crawl.url)
  .then((response) => {
    const result = [];
    const { host, protocol } = urlParser.parse(crawl.url);
    if (seenUrls[crawl.url]) return;
    console.log("crawling", crawl.url);
    seenUrls[crawl.url] = true;
    const ignore = crawl.ignore;
    const html = response.data;
    const $ = cheerio.load(html);
    // Collect the href of every anchor on the page, skipping anchors without one.
    const links = $("a")
      .map((i, link) => link.attribs.href)
      .get()
      .filter(Boolean);
    links
      .filter((link) => link.includes(host) && !link.includes(ignore))
      .forEach((link) => {
        result.push(link);
        console.log(link + "\n");
        // Record the normalized link as the next crawl target (note: the script
        // never re-invokes the request, so only the start page is fetched).
        Object.assign(crawl, {
          url: getUrl(link, host, protocol),
          ignore,
        });
      });
    // Write one URL per line to test.txt.
    const data = replaceCommaLine(result.toString());
    fs.writeFile("test.txt", data, function (err) {
      if (err) return console.log(err);
    });
  })
  .catch((err) => console.log(err));
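
Usage sketch (assumptions: Node.js is available, the axios and cheerio packages are installed, and the script is saved as crawler.js, a filename chosen here only for illustration):

    npm install axios cheerio
    node crawler.js

The links that pass the filter are printed to the console and written, one per line, to test.txt in the working directory.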