Last active
November 20, 2020 13:58
-
-
Save BillGR17/f349e4d00d01116e6dc75eaa4834bda4 to your computer and use it in GitHub Desktop.
Checks website links [no external links]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const http = require("http"), | |
https = require("https"), | |
url = require("url"), | |
ifaces = require("os").networkInterfaces(); | |
//Address of this machine on the local network: the IPv4 address of the last
//non-internal interface reported by the OS, or undefined when there is none.
//unsafe() compares the user's URL against it.
let localhost;
for (const addresses of Object.values(ifaces)) {
  for (const details of addresses) {
    //some Node 18 releases report the family as the number 4 instead of "IPv4"
    const isIPv4 = details.family === "IPv4" || details.family === 4;
    if (isIPv4 && details.internal === false) {
      localhost = details.address;
    }
  }
}
//Origin (protocol + host) of the URL supplied by the user.
//Assigned by checkArg(); site-relative links are resolved against it.
let domain;
//Checks that a string is an absolute http(s) URL this tool can work with.
//It is only used once, for the URL supplied by the user.
//Accepted hosts: a dotted domain name, "localhost" or a dotted IPv4 address,
//optionally followed by a port, a path, a query string and a fragment.
//Every host label is matched by ONE optional group: repeating that group would
//let the regex engine backtrack exponentially on long invalid host names.
const VALID_URL_PATTERN = new RegExp("^https?:\\/\\/" + // protocol
  "(?:(?:[a-z\\d](?:[a-z\\d-]*[a-z\\d])?\\.)+[a-z]{2,}|" + // domain name
  "localhost|" + // local development host
  "(?:\\d{1,3}\\.){3}\\d{1,3})" + // or IP (v4) address
  "(?::\\d+)?(?:\\/[-a-z\\d%_.~+]*)*" + // port and path
  "(?:\\?[;&a-z\\d%_.~+=-]*)?" + // query string
  "(?:#[-a-z\\d_]*)?$", "i"); // fragment locator
//str: the candidate URL (non-strings are converted to a string by the regex test)
//returns: true when str matches the pattern above, false otherwise
function validURL(str) {
  return VALID_URL_PATTERN.test(str);
}
//Promise flavoured wrapper around validURL() for the URL the user provided.
//Resolves with the link itself when it is valid.
//Rejects with a plain message string (not an Error) otherwise.
async function verifyLink(link) {
  if (!validURL(link)) {
    throw `${link} is not a valid URL`;
  }
  return link;
}
//Requests a link with GET and reports the HTTP status code of the response.
//link: absolute URL; anything starting with "https" goes through the https module,
//      everything else through http
//returns: a promise resolved with the numeric status code,
//         or rejected with the error emitted by the request
function checkLink(link) {
  return new Promise((res, rej) => {
    const client = link.startsWith("https") ? https : http;
    client.get(link, (_res) => {
      //only the status is needed, but the body still has to be drained:
      //an unread response keeps its socket and buffered data around
      _res.resume();
      res(_res.statusCode);
    }).on("error", rej);
  });
}
//Ignores insecure (e.g. self-signed) certificates, but only when the URL given on
//the command line points at this machine: "localhost", 127.0.0.1 or the
//address stored in `localhost`.
//The decision is made on the parsed host name. Looking for those words anywhere in
//the argument would also switch verification off for remote sites whose path or
//query merely mentions them.
function unsafe() {
  const target = process.argv[2];
  if (!target) {
    return;
  }
  let hostname;
  try {
    hostname = new URL(target).hostname;
  } catch (err) {
    //not parsable as a URL: nothing to relax, validation happens later
    return;
  }
  const isThisMachine = hostname === "localhost" ||
    hostname === "127.0.0.1" ||
    (Boolean(localhost) && hostname === localhost);
  if (isThisMachine) {
    //environment values are strings; "0" disables certificate verification
    process.env["NODE_TLS_REJECT_UNAUTHORIZED"] = "0";
  }
}
//First things first:
//validates the URL passed as the first command line argument and requests it once.
//On success (status code up to 302) the site origin is stored in `domain`
//and the URL is returned.
//Otherwise the reason is printed and null is returned.
async function checkArg() {
  try {
    const link = await verifyLink(process.argv[2]);
    const status = await checkLink(link);
    if (status <= 302) {
      //`host` keeps a non-default port while `hostname` drops it; without the port
      //a site such as http://localhost:3000 would be crawled on port 80
      const { protocol, host } = new URL(link);
      domain = protocol + "//" + host;
      return link;
    }
    console.log("The provided URL Must Return [200] but returns [" + status + "]");
  } catch (err) {
    console.error(err + "\nPlease provide a URL like 'http://example.com'");
  }
  return null;
}
//Turns raw attribute matches such as href="/a" or src='b.png' into bare links:
// - the leading href= / src= is removed
// - one surrounding quote (single or double) is stripped from each end
// - for links starting with ../ only the part after the last ../ is kept
// - in-page anchors (starting with # or /#) are left out
//urls: any collection with forEach (an array or a Set)
//returns: a promise resolved with the array of cleaned links
async function clean(urls) {
  const isQuote = (ch) => ch === "\"" || ch === "'";
  const links = [];
  urls.forEach((raw) => {
    let link = raw.replace(/^(href=|src=)/g, "");
    if (link.length > 0 && isQuote(link[link.length - 1])) {
      link = link.slice(0, -1);
    }
    if (link.length > 0 && isQuote(link[0])) {
      link = link.slice(1);
    }
    if (link.startsWith("../")) {
      link = link.split("../").pop();
    }
    const isAnchor = link.startsWith("#") || link.startsWith("/#");
    if (!isAnchor) {
      links.push(link);
    }
  });
  return links;
}
//Prefixes of links that never point at a page of the crawled site:
//absolute URLs with a scheme, protocol-relative URLs ("//host/...") and
//pseudo links such as mailto:, tel:, data: or javascript:.
//The pattern is anchored so that a site path which merely contains one of these
//words (for example /hotel:list or /r/http://...) is still treated as local.
const NON_LOCAL_PREFIX = /^(?:https?:\/\/|mailto:|data:|tel:|ftp:\/\/|file:\/\/|news:\/\/|telnet:\/\/|gopher:\/\/|nntp:\/\/|javascript:|\/\/)/i;
//Tells whether a link may be a path on the crawled site.
//_url: link text taken from an href/src attribute
//returns: undefined for an empty/missing link,
//         false when the link starts with one of the prefixes above or has a
//         template tag ({%) within its first 10 characters (useless template links),
//         true otherwise
function nonolist(_url) {
  if (!_url) {
    return undefined;
  }
  const hasTemplateTag = _url.slice(0, 10).includes("{%");
  //leading whitespace inside the attribute must not hide the scheme
  return !(NON_LOCAL_PREFIX.test(_url.trim()) || hasTemplateTag);
}
//Reduces a list of cleaned links to unique, site-relative paths:
// - a link without a leading / that nonolist() accepts gets the / prepended
// - a link that starts with the site origin (`domain`) has the origin removed
// - any other link is kept only when nonolist() accepts it
//   (absolute URLs of other sites, mailto: and the like are dropped)
//links: array of links, may be undefined
//returns: a promise resolved with the resulting array, without duplicates,
//         in order of first appearance
function removeUnwantedLinks(links) {
  return new Promise((res) => {
    const origin = String(domain);
    const stored = new Set();
    for (const link of links || []) {
      const rest = link.startsWith(origin) ? link.slice(origin.length) : null;
      if (!link.startsWith("/") && nonolist(link)) {
        stored.add("/" + link);
      } else if (rest !== null && /^(?:[/?#]|$)/.test(rest)) {
        //the origin has to end where the link's host ends, otherwise
        //http://example.com would also claim http://example.com.evil.org;
        //a bare origin (or origin + query) is normalised to a path starting with /
        stored.add(rest.startsWith("/") ? rest : "/" + rest);
      } else if (nonolist(link)) {
        stored.add(link);
      }
    }
    res(Array.from(stored));
  });
}
//Downloads a page and resolves with the site-relative links found in its
//href/src attributes (cleaned by clean(), sorted, filtered by removeUnwantedLinks()).
//Resolves with undefined when the page cannot be used: request error,
//status other than 200 or a content type that is not text/html.
//It never rejects on purpose: a failure is logged and reported as undefined,
//because some servers block multiple connections and one bad page should not
//stop the whole run.
function getLinks(link) {
  return new Promise((res) => {
    const giveUp = (err) => {
      console.error(err);
      res(undefined);
    };
    try {
      const client = link.startsWith("https") ? https : http;
      client.get(link, (_res) => {
        //the header may be missing altogether
        const contentType = _res.headers["content-type"] || "";
        if (_res.statusCode != 200 || !contentType.includes("text/html")) {
          //drain the body that is not going to be read so the socket is released
          _res.resume();
          res(undefined);
          return;
        }
        _res.setEncoding("utf-8");
        //must start as a string: appending to undefined would prefix "undefined"
        let data = "";
        _res.on("data", (chunk) => {
          data += chunk;
        });
        _res.on("error", giveUp);
        _res.on("end", async () => {
          try {
            //all href/src attributes, double or single quoted, without duplicates
            const found = new Set(data.match(/(?:href|src)=(?:".*?"|'.*?')/g) || []);
            //remove the src and href
            const cleaned = Array.from(await clean(found)).sort();
            res(await removeUnwantedLinks(cleaned));
          } catch (err) {
            giveUp(err);
          }
        });
      }).on("error", giveUp);
    } catch (err) {
      //get() throws synchronously for a link it cannot parse
      giveUp(err);
    }
  });
}
//Decodes a path for display only; text that is not valid percent-encoding
//(for example "/100%") is returned unchanged instead of throwing.
function readablePath(path) {
  try {
    return decodeURIComponent(path);
  } catch (err) {
    return path;
  }
}
//Crawls the site starting from a list of site-relative paths.
//Every path is requested once for its status code and once more for the links it
//contains; links that are not queued yet are appended to `list`, so the loop keeps
//running until no new path shows up.
//list: array of paths relative to `domain`; it grows while the crawl runs
//returns: a promise resolved with [{ link, status }] in visiting order, status
//         being null when the request failed
//When list is missing a message is printed and the process exits with code 1.
async function go(list) {
  if (list) {
    const visited = [];
    //mirror of `list` for constant-time "already queued?" checks
    const queued = new Set(list);
    for (let i = 0; i < list.length; i++) {
      const path = list[i];
      const total = list.length;
      let status = null;
      try {
        status = await checkLink(domain + path);
      } catch (err) {
        console.log(err);
      }
      console.log(`[\x1b[34m${status}\x1b[0m] ${i + 1}\x1b[36m/\x1b[34m${total} \x1b[0m\x1b[36m${domain}\x1b[0m\x1b[34m${readablePath(path)}\x1b[0m`);
      visited.push({
        link: path,
        status: status
      });
      //links of the current page; only the ones not queued yet are added
      const found = await getLinks(domain + path);
      for (const next of found || []) {
        if (!queued.has(next)) {
          queued.add(next);
          list.push(next);
        }
      }
    }
    return visited;
  } else {
    console.log("Couldnt find anything!");
    process.exit(1);
  }
}
//Entry point: checks the URL from the command line, crawls the site starting
//from the links of that page and finally lists every visited link whose
//status code is not 200.
async function main() {
  //if localhost ignore certificate
  unsafe();
  //check if url is good
  const startLink = await checkArg();
  if (!startLink) {
    return;
  }
  const visited = await go(await getLinks(startLink));
  //grab everything that doesn't have status code 200 and show it
  const failed = visited.filter((entry) => entry.status !== 200);
  if (failed.length) {
    console.error(`\x1b[31m[${failed.length}] Sites that did not return 200 are listed here\x1b[0m`);
    failed.forEach((entry) => {
      console.error(`Link: "${domain}${entry.link}" Status: ${entry.status}`);
    });
  } else {
    console.error("\x1b[31mLucky bastard!\x1b[0m");
  }
}
//a failure anywhere in the run is reported here instead of surfacing as an
//unhandled promise rejection
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment