Skip to content

Instantly share code, notes, and snippets.

@franzenzenhofer
Created June 1, 2023 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save franzenzenhofer/11f95b157053b375a02f2023ca9cdebe to your computer and use it in GitHub Desktop.
Save franzenzenhofer/11f95b157053b375a02f2023ca9cdebe to your computer and use it in GitHub Desktop.
const axios = require('axios');
const cheerio = require('cheerio');
const argv = require('yargs').argv;
const sleep = require('util').promisify(setTimeout);
// Base URL of the site to crawl: the first positional command-line argument,
// or an empty string when none was supplied.
const start_url = argv._[0] || '';
// Seconds to pause before each backlink request, read from the `-s` option.
// An explicit `-s 0` is honoured (a plain `argv.s || 1` would silently turn it
// into 1). A missing, empty, negative or non-numeric value falls back to 1,
// because a NaN delay would make the timer fire almost immediately.
const delay = (() => {
  const seconds = Number(argv.s);
  const isUsable =
    argv.s != null && argv.s !== '' && Number.isFinite(seconds) && seconds >= 0;
  return isUsable ? seconds : 1;
})();
// Headers sent with every request; the User-Agent imitates a desktop Chrome
// browser.
const headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
};
/**
 * Downloads a page and hands its body to cheerio.
 *
 * The request is sent with the module-level `headers`. Any failure (a thrown
 * request/parse error or a status other than 200) is reported on stderr and
 * turned into a `null` result instead of an exception.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<Function|null>} Whatever `cheerio.load` returns for the
 *   response body, or `null` when the page could not be retrieved.
 */
async function getHtml(url) {
  try {
    const reply = await axios.get(url, { headers });
    if (reply.status === 200) {
      return cheerio.load(reply.data);
    }
    console.error(`Received status code ${reply.status} when fetching ${url}`);
    return null;
  } catch (failure) {
    console.error(`Error fetching ${url}: ${failure.message}`);
    return null;
  }
}
/**
 * Collects the topic links on a page, i.e. anchors whose `href` begins with
 * `/t/`, as absolute URLs.
 *
 * Each href is resolved against `start_url` with the URL parser rather than by
 * string concatenation, so a start URL with a trailing slash or a path does
 * not produce `//t/...` or mis-rooted addresses. Repeated links are returned
 * once, in first-seen order, so callers do not fetch the same page twice.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @returns {Promise<string[]>} Unique absolute topic URLs in document order.
 */
async function getLinks($) {
  const unique = new Set();
  $('a').each((i, link) => {
    const href = $(link).attr('href');
    if (!href || !href.startsWith('/t/')) {
      return;
    }
    let absolute;
    try {
      absolute = new URL(href, start_url).href;
    } catch (error) {
      // start_url is not a parseable absolute URL; keep the plain
      // concatenation so the caller still receives a value.
      absolute = `${start_url}${href}`;
    }
    unique.add(absolute);
  });
  return [...unique];
}
/**
 * Reports whether the page at `url` contains a link pointing back to
 * `start_url`.
 *
 * Waits `delay` seconds before fetching. Every anchor on the page is resolved
 * against the page's own URL and compared with `start_url` after dropping the
 * fragment and trailing slashes. Anchors are inspected directly instead of
 * through `getLinks`, because `getLinks` only returns `/t/` topic links, which
 * can never be equal to the start URL itself.
 *
 * @param {string} url - Page to inspect.
 * @returns {Promise<boolean>} `true` when a link to `start_url` is found;
 *   `false` otherwise, including when the page could not be fetched or
 *   `start_url` is not a valid absolute URL.
 */
async function checkBacklink(url) {
  await sleep(delay * 1000);
  const $ = await getHtml(url);
  if (!$) return false;

  // Normalises a (possibly relative) href so equivalent spellings of the same
  // address compare equal; returns null for values that cannot be parsed.
  const canonical = (href, base) => {
    try {
      const parsed = new URL(href, base);
      parsed.hash = '';
      return parsed.href.replace(/\/+$/, '');
    } catch (error) {
      return null;
    }
  };

  const target = canonical(start_url);
  if (target === null) return false;

  let found = false;
  $('a').each((i, link) => {
    const href = $(link).attr('href');
    if (href && canonical(href, url) === target) {
      found = true;
      return false; // returning false stops cheerio's iteration early
    }
    return undefined;
  });
  return found;
}
/**
 * Fetches `url`, gathers the links `getLinks` finds on it, and prints one line
 * per link saying whether that linked page links back to `url`.
 *
 * Links are checked one after another. Nothing is printed when the page
 * cannot be fetched.
 *
 * @param {string} url - Page whose outgoing links are examined.
 * @returns {Promise<void>}
 */
async function crawl(url) {
  const page = await getHtml(url);
  if (page) {
    const targets = await getLinks(page);
    for (const target of targets) {
      const message = (await checkBacklink(target))
        ? `${target} links back to ${url}`
        : `${target} does not link back to ${url}`;
      console.log(message);
    }
  }
}
// Entry point. Without a start URL there is nothing to crawl, so print a usage
// hint instead of issuing a request for an empty URL. The promise returned by
// crawl() is given a rejection handler so an unexpected failure is reported
// and reflected in the exit code rather than left unhandled.
if (!start_url) {
  console.error('Usage: node <script> <start_url> [-s <delay_seconds>]');
  process.exitCode = 1;
} else {
  crawl(start_url).catch((error) => {
    console.error(`Crawl failed: ${error.message}`);
    process.exitCode = 1;
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment