Skip to content

Instantly share code, notes, and snippets.

@BillGR17
Last active November 20, 2020 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BillGR17/f349e4d00d01116e6dc75eaa4834bda4 to your computer and use it in GitHub Desktop.
Save BillGR17/f349e4d00d01116e6dc75eaa4834bda4 to your computer and use it in GitHub Desktop.
Checks website links [no external links]
const http = require("http"),
https = require("https"),
url = require("url"),
ifaces = require("os").networkInterfaces();
//returns the address of the last non-internal IPv4 entry
//found in an os.networkInterfaces() style object
//or undefined when there is none
function findExternalIPv4(interfaces) {
  let found;
  Object.keys(interfaces).forEach(dev => {
    interfaces[dev].forEach(details => {
      //Node 18.0 - 18.3 report family as the number 4 instead of "IPv4"
      const isIPv4 = details.family === "IPv4" || details.family === 4;
      if (isIPv4 && details.internal === false) {
        found = details.address;
      }
    });
  });
  return found;
}
//LAN address of this machine, used by unsafe() to recognise local targets
const localhost = findExternalIPv4(ifaces);
//Base of the URL the user provided (protocol + "//" + host name)
//assigned by checkArg and prepended to every path that gets checked
let domain;
//pattern for the http(s) URLs this tool accepts
//built once instead of on every call
//each host label is written as alnum + optional tail so a long
//host without dots cannot trigger catastrophic backtracking
const VALID_URL_PATTERN = new RegExp("^https?:\\/\\/" + // protocol
  "(?:localhost|" + // bare localhost, so local sites can be checked
  "(?:[a-z\\d](?:[a-z\\d-]*[a-z\\d])?\\.)+[a-z]{2,}|" + // domain name
  "(?:\\d{1,3}\\.){3}\\d{1,3})" + // or IP (v4) address
  "(?::\\d+)?(?:\\/[-a-z\\d%_.~+]*)*" + // port and path
  "(?:\\?[;&a-z\\d%_.~+=-]*)?" + // query string
  "(?:#[-a-z\\d_]*)?$", "i"); // fragment locator
//check if URL is correct
//its only used once for the user URL
//but it can be used on clean function...
//returns true when str is an accepted http(s) URL, false otherwise
function validURL(str) {
  return VALID_URL_PATTERN.test(str);
}
//Validates the URL the user provided
//resolves with the link itself when validURL accepts it
//rejects with a plain message string otherwise
async function verifyLink(link) {
  if (!validURL(link)) {
    //the rejection reason is a string on purpose, callers append text to it
    throw `${link} is not a valid URL`;
  }
  return link;
}
//returns status code from link
//resolves with the HTTP status code of a GET request
//rejects with the request error when the request fails
function checkLink(link) {
  return new Promise((res, rej) => {
    //validURL accepts the protocol in any case, so match it the same way
    const client = /^https:/i.test(link) ? https : http;
    client.get(link, (_res) => {
      //only the status is needed, discard the body so the socket is released
      _res.resume();
      res(_res.statusCode);
    }).on("error", (err) => {
      rej(err);
    });
  });
}
//ignore insecure certificate
//if its a localhost ignore certificate
//only the host name of the URL is compared, so "localhost" showing up
//in a path or query string of a remote site does not disable verification
function unsafe() {
  const target = process.argv[2];
  if (!target) {
    return;
  }
  let host;
  try {
    host = new URL(target).hostname;
  } catch (err) {
    //not a parsable URL, nothing to relax here
    return;
  }
  const local = ["localhost", "127.0.0.1"];
  //localhost stays undefined when no external IPv4 interface was found
  if (localhost) {
    local.push(localhost);
  }
  if (local.includes(host)) {
    //environment values are strings
    process.env["NODE_TLS_REJECT_UNAUTHORIZED"] = "0";
  }
}
//First things first
//check the URL the user has entered
//then either log an error or continue
//returns the URL when it is valid and answers with a status up to 302
//returns null otherwise
//on success it also stores protocol + host (including the port) in domain
async function checkArg() {
  try {
    const link = await verifyLink(process.argv[2]);
    const status = await checkLink(link);
    if (status <= 302) {
      const parsed = new URL(link);
      //host keeps an explicit port, hostname would silently drop it
      //and every following request would go to the default port
      domain = `${parsed.protocol}//${parsed.host}`;
      return link;
    }
    console.log("The provided URL Must Return [200] but returns [" + status + "]");
  } catch (err) {
    console.error(err + "\nPlease provide a URL like 'http://example.com'");
  }
  return null;
}
//strips the leading href= | src= and the surrounding quotes
//drops the leading ../ parts of a link
//leaves out links that start with # or /#
//resolves with the cleaned links as an array
async function clean(urls) {
  const links = [];
  for (const raw of urls) {
    let entry = raw.replace(/^(href=|src=)/g, "");
    if (entry.endsWith("\"") || entry.endsWith("'")) {
      entry = entry.slice(0, -1);
    }
    if (entry.startsWith("\"") || entry.startsWith("'")) {
      entry = entry.slice(1);
    }
    if (entry.startsWith("../")) {
      //keep only what follows the last ../
      entry = entry.split("../").pop();
    }
    if (entry.startsWith("#") || entry.startsWith("/#")) {
      continue;
    }
    links.push(entry);
  }
  return links;
}
//contains list of protocols
//anchored at the start so a path like /hotel:rooms is not mistaken for tel:
//also covers javascript: and protocol relative //host links
const EXTERNAL_PREFIX = /^\s*(?:https?:\/\/|mailto:|data:|tel:|ftp:\/\/|file:\/\/|news:\/\/|telnet:\/\/|gopher:\/\/|nntp:\/\/|javascript:|\/\/)/i;
//returns true when _url should be treated as a path on the checked site
//returns false for external protocols and for angular useless links ({%)
//returns undefined for an empty or missing value
function nonolist(_url) {
  if (!_url) {
    return undefined;
  }
  if (EXTERNAL_PREFIX.test(_url)) {
    return false;
  }
  //template placeholders are looked for in the first 10 characters only
  return !_url.slice(0, 10).includes("{%");
}
//checks each link
//if it doesn't have / at the beginning it gets one
//if it starts with hosts name it will remove it
//else it is kept unless it starts with one of the nonolist protocols
//resolves with the remaining paths, every path listed only once
//in the order it was first seen
function removeUnwantedLinks(links) {
  return new Promise((res) => {
    //a Set keeps insertion order and drops the duplicates that show up
    //when the same page is linked as a, /a and <domain>/a
    const stored = new Set();
    if (links) {
      links.forEach((i) => {
        if (!i.startsWith("/") && nonolist(i)) {
          stored.add("/" + i);
        } else if (i.startsWith(domain)) {
          stored.add(i.replace(domain, ""));
        } else if (nonolist(i)) {
          stored.add(i);
        }
      });
    }
    res(Array.from(stored));
  });
}
//downloads a page and collects the href | src links found in it
//resolves with the cleaned list of paths (sorted before filtering)
//resolves with undefined when the request fails, the status is not 200
//or the response is not text/html
function getLinks(link) {
  return new Promise((res) => {
    //its better to never return rejections
    //just send undefined and log the error
    //some servers block multiple connections....
    const fail = (err) => {
      console.error(err);
      res(undefined);
    };
    try {
      //validURL accepts the protocol in any case, so match it the same way
      const client = /^https:/i.test(link) ? https : http;
      client.get(link, (_res) => {
        //the header may be missing completely
        const type = _res.headers["content-type"] || "";
        if (_res.statusCode !== 200 || !type.includes("text/html")) {
          //discard the body so the socket is released
          _res.resume();
          res(undefined);
          return;
        }
        _res.setEncoding("utf-8");
        //start with an empty string, otherwise the text "undefined" gets prepended
        let data = "";
        _res.on("data", (buf) => {
          data += buf;
        });
        _res.on("error", fail);
        _res.on("end", async () => {
          try {
            let _url = [];
            if (data) {
              //get all matching href links and src links
              //with double or single quotes, clean handles both
              const found = data.match(/(?:href|src)=(["'])(.*?)\1/g) || [];
              //remove the src and href, then drop what became identical
              const cleaned = await clean(new Set(found));
              _url = await removeUnwantedLinks(Array.from(new Set(cleaned)).sort());
            }
            res(_url);
          } catch (err) {
            fail(err);
          }
        });
      }).on("error", fail);
    } catch (err) {
      //get throws synchronously on a link it cannot parse
      fail(err);
    }
  });
}
//decodes a path for display
//falls back to the raw text when it holds a malformed % sequence
//because decodeURIComponent throws on those
function safeDecode(text) {
  try {
    return decodeURIComponent(text);
  } catch (err) {
    return text;
  }
}
//visits every path of list one after the other
//logs the status of each one and appends the not yet known links
//found on the visited page to list, so list grows while it runs
//returns an array of { link, status }, status is null when the request failed
//exits the process with code 1 when there is no list at all
async function go(list) {
  if (list) {
    const visited = [];
    //constant time lookup of the paths that are already queued
    const seen = new Set(list);
    for (let i = 0; i < list.length; i++) {
      const total = list.length;
      let status = null;
      try {
        status = await checkLink(domain + list[i]);
      } catch (err) {
        console.log(err);
      }
      //the decodeURIComponent does not decode everything correctly...
      //but I'm too lazy to fix the missing characters so.. ...this will have to do
      console.log(`[\x1b[34m${status}\x1b[0m] ${i + 1}\x1b[36m/\x1b[34m${total} \x1b[0m\x1b[36m${domain}\x1b[0m\x1b[34m${safeDecode(list[i])}\x1b[0m`);
      visited.push({
        link: list[i],
        status: status
      });
      //temporarily store current page links
      //and then check if they are already visited
      const tmp = await getLinks(domain + list[i]);
      if (tmp) {
        for (const x of tmp) {
          if (!seen.has(x)) {
            seen.add(x);
            list.push(x);
          }
        }
      }
    }
    return visited;
  } else {
    console.log("Couldnt find anything!");
    process.exit(1);
  }
}
//runs the whole check
//relaxes the certificate check for local targets, verifies the URL
//crawls the site and finally lists every link that did not answer with 200
async function main() {
  //if localhost ignore certificate
  unsafe();
  //check if url is good
  const startUrl = await checkArg();
  if (!startUrl) {
    return;
  }
  const firstLinks = await getLinks(startUrl);
  const visited = await go(firstLinks);
  //grab everything that doesn't have status code 200 and show it
  const failures = visited.filter(({ status }) => status !== 200);
  if (failures.length === 0) {
    console.error("\x1b[31mLucky bastard!\x1b[0m");
    return;
  }
  console.error(`\x1b[31m[${failures.length}] Sites that did not return 200 are listed here\x1b[0m`);
  for (const { link, status } of failures) {
    console.error(`Link: "${domain}${link}" Status: ${status}`);
  }
}
//start the check
//an unexpected failure is logged and reported through the exit code
//instead of being left as an unhandled rejection
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment