Last active
April 4, 2024 01:53
-
-
Save franklindyer/16e73afd354b3ae3704344aaa8ab0c61 to your computer and use it in GitHub Desktop.
Broken link finder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Utility for crawling (small) websites and detecting dead links.
// Enter the URL of the homepage, or the desired starting page of the crawl.
// The bot will check validity of links to other domains, but will not recurse over them.
// Problematic links are printed to STDOUT.
// Progress info (# pending requests) is printed to STDERR.
import Queue from 'queue'; | |
import request from 'request'; | |
import follow_redirects from 'follow-redirects'; | |
const https = follow_redirects.https; | |
import url from 'url'; | |
import cheerio from 'cheerio'; | |
import readline from 'readline'; | |
// --- Crawl settings ---------------------------------------------------------

// Upper bound on simultaneously outstanding requests.
const maxPending = 50;
// Sent as the User-Agent header of every request.
const USR_AGENT_STR = "Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405";

// --- Crawl state ------------------------------------------------------------

// Count of requests that have been issued but not yet accounted for.
let pending = 0;
// Used as a set: a URL is a key here once it has been queued or checked.
const alreadyAdded = {};
// Work queue of [parentUrl, url] pairs still waiting to be checked.
const toVisitQ = new Queue();
/**
 * Prints the number of pending requests and the queue length to STDERR,
 * then re-arms itself one second later for as long as either is positive.
 */
function progressCounter() {
  console.error(`Current pending requests: ${pending}`);
  console.error(`Current queue length: ${toVisitQ.length}`);

  const nothingLeft = toVisitQ.length <= 0 && pending <= 0;
  if (nothingLeft) {
    return;
  }
  setTimeout(progressCounter, 1000);
}
/**
 * Scheduler tick: dispatches queued links while request slots are free, then
 * re-arms itself. Terminates the process once there is nothing queued and
 * nothing pending.
 *
 * @param {string} topLevelDom - Hostname of the site being crawled; forwarded
 *   unchanged to checkLink.
 */
function scheduleVisit(topLevelDom) {
  if (toVisitQ.length === 0 && pending === 0) {
    process.exit(0);
  }

  // Drain with a loop rather than recursion: the recursive form armed one
  // extra timer per dispatched link, so the number of concurrent timer chains
  // grew without bound over the life of the crawl.
  while (toVisitQ.length > 0 && pending < maxPending) {
    const [parentUrl, nextUrl] = toVisitQ.shift();
    checkLink(topLevelDom, nextUrl, parentUrl);
  }

  // Exactly one follow-up tick per invocation.
  setTimeout(() => scheduleVisit(topLevelDom), 100);
}
// Some URLs do not stand alone (e.g. relative URLs).
// This function attempts to "clean" these URLs to make them valid.
/**
 * Turns a raw href attribute into an absolute https URL, or rejects it.
 *
 * Accepted forms:
 *   - "https://..."            returned as-is (scheme matched case-insensitively)
 *   - "//host/path"            prefixed with "https:"
 *   - "/path"                  resolved against topLevelDom
 * Everything else (missing href, other schemes, fragments, path-relative
 * links) yields false.
 *
 * @param {string} topLevelDom - Hostname used to resolve root-relative links.
 * @param {string|undefined|null} rawHref - Value of the href attribute.
 * @returns {string|false} Absolute https URL, or false when unusable.
 */
function sanitizeHref(topLevelDom, rawHref) {
  if (typeof rawHref !== 'string') {
    return false;
  }

  // href values may legally be padded with whitespace.
  const href = rawHref.trim();

  // Require the full "https://" prefix: testing only for "https" would treat
  // a relative file name such as "https-guide.html" as an absolute URL.
  if (href.slice(0, 8).toLowerCase() === 'https://') {
    return href;
  }
  if (href.startsWith('//')) {
    return `https:${href}`;
  }
  if (href.startsWith('/')) {
    return `https://${topLevelDom}${href}`;
  }
  return false;
}
/**
 * Hook invoked for responses that were reported as problematic.
 * Currently a placeholder: it singles out HTTP 429 responses but takes no
 * action on them.
 *
 * @param {{statusCode: number}} res - Response object of the failed request.
 */
function customErrorChecks(res) {
  const isRateLimited = res.statusCode === 429;
  if (!isRateLimited) {
    return;
  }
  // Nothing to do yet; inspecting the rate-limit headers was left disabled.
}
/**
 * Requests one URL over HTTPS and reports it on STDOUT if the request fails
 * or the response status is 400 or above. When the URL is on the crawled
 * host, the response body is parsed and every link that sanitizeHref accepts
 * and that is not yet in alreadyAdded is queued on toVisitQ.
 *
 * The `pending` counter is incremented when the request is issued and
 * decremented exactly once, after the response has been fully handled.
 *
 * @param {string} topLevelDom - Hostname of the site being crawled.
 * @param {string} nextUrl - URL to check.
 * @param {string} parentUrl - Page the URL was found on (used in reports).
 */
function checkLink(topLevelDom, nextUrl, parentUrl) {
  alreadyAdded[nextUrl] = true;

  const reportProblem = (link, detail) => {
    console.log(`Problematic link: ${link}`);
    console.log(`Linked from page: ${parentUrl}`);
    console.log(`${detail}\n`);
  };

  // decodeURI throws URIError on malformed percent-escapes; report the link
  // instead of letting one bad href take down the whole crawl.
  let targetUrl;
  try {
    targetUrl = encodeURI(decodeURI(nextUrl));
  } catch (err) {
    reportProblem(nextUrl, `Error: ${err.toString()}`);
    return;
  }

  const uri = url.parse(targetUrl);
  const opts = {
    hostname: uri.hostname,
    port: uri.port,
    // `path` is pathname + query string; `pathname` alone would silently
    // request a different resource for any link carrying a query.
    path: uri.path,
    headers: {
      "User-Agent": USR_AGENT_STR, // Some sites respond 403 to requests without User-Agent
    },
  };

  pending += 1;
  let settled = false;
  // Guards the counter so that a response followed by a late request error
  // (or 'end' followed by 'close') cannot decrement it twice.
  const settle = () => {
    if (!settled) {
      settled = true;
      pending -= 1;
    }
  };

  const req = https.get(opts, (res) => {
    if (res.statusCode >= 400) {
      reportProblem(targetUrl, `Response code: ${res.statusCode} ${res.statusMessage}`);
      customErrorChecks(res);
      res.resume(); // Discard the body so the connection is released.
      settle();
      return;
    }

    // Compare hostnames exactly; a string-prefix test would also match
    // look-alike hosts such as "example.com.evil.org".
    if (uri.hostname !== topLevelDom) {
      res.resume();
      settle();
      return;
    }

    // Collect the whole body before parsing: parsing chunk by chunk loses
    // any link whose markup straddles a chunk boundary.
    const chunks = [];
    res.on('data', (chunk) => chunks.push(chunk));
    res.on('end', () => {
      const $ = cheerio.load(Buffer.concat(chunks));
      $('a').each((i, link) => {
        const href = sanitizeHref(topLevelDom, $(link).attr('href'));
        if (href === false || href in alreadyAdded) {
          return;
        }
        alreadyAdded[href] = true;
        toVisitQ.push([targetUrl, href]);
      });
      // Only now mark the request as done, so the scheduler cannot observe
      // "queue empty and nothing pending" before these links were queued.
      settle();
    });
    // Covers bodies that are cut short and therefore never emit 'end'.
    res.on('close', settle);
  });

  req.on('error', (err) => {
    settle();
    reportProblem(targetUrl, `Error: ${err.toString()}`);
  });
}
// Entry point: prompt (on STDERR, so STDOUT stays reserved for the report)
// for the starting URL, seed the queue with it and start the scheduler and
// the progress printer.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stderr,
});

rl.question("URL of starting page for dead link test:\n", (startUrl) => {
  // Only one answer is needed; release the interface.
  rl.close();

  const encUrl = encodeURI(startUrl.trim());
  const startUri = url.parse(encUrl);

  // Without a scheme (e.g. "example.com") url.parse yields no hostname, and
  // the crawl would otherwise run against a host literally named "null".
  if (!startUri.hostname) {
    console.error(`Could not determine a hostname from "${startUrl}". Enter a full URL such as https://example.com/`);
    process.exit(1);
  }

  // The starting page has no parent, hence the empty string.
  toVisitQ.push(['', encUrl]);
  scheduleVisit(startUri.hostname);
  setTimeout(progressCounter, 100);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment