Skip to content

Instantly share code, notes, and snippets.

@cedriczirtacic
Last active October 26, 2017 17:52
Show Gist options
  • Save cedriczirtacic/04be05c7c67983ca387c4e6cccc26aba to your computer and use it in GitHub Desktop.
Save cedriczirtacic/04be05c7c67983ca387c4e6cccc26aba to your computer and use it in GitHub Desktop.
fetch and check robots.txt entries
#!/bin/env node
// https://gist.github.com/cedriczirtacic
const colors = require('colors');
/**
 * Print the usage line (when a script name is available) and terminate
 * the process with exit status 1.
 *
 * @param {string} [e] - script name shown in the usage line; nothing is
 *   printed when it is undefined, null or empty.
 */
function help(e) {
  // Both conditions must hold. The previous `||` was always true, so the
  // usage line was printed even without a name ("usage: undefined <url>").
  if (e != undefined && e != "")
    console.info("usage: %s <url>", e);
  process.exit(1);
}
/**
 * Log a status code next to the path it belongs to, colored by status class.
 *
 * The class is floor(code / 100): 3 is yellow, 4 is red, 5 is blue and
 * anything else is green. The color is applied by reading the property of
 * that name on the format string (presumably provided by the `colors`
 * package required at the top of the file).
 *
 * @param {number} code - status code to report.
 * @param {string} f - path the status code was obtained for.
 */
function print_code(code, f) {
  const palette = { 3: 'yellow', 4: 'red', 5: 'blue' };
  const tint = palette[Math.floor(code / 100)] || 'green';
  console.info('[%d] %s'[tint], code, f);
}
function parse_robots(data) {
var files = [];
if (data == undefined || data == "") {
console.error('[error]'.red,
"robots data is empty or invalid");
return false;
}
data.split("\n").forEach(function(d) {
if (!/\*.+$/.test(d) && (/\*$/.test(d) || d !== undefined) ) {
d = d.replace("*", "");
var f = /^(?:(?:Allow|Disallow)\:[\s\t]*)(.+)$/i.exec(d);
if (f !== null && d != '')
files[files.length] = f[1];
}
});
if (files.length == 0) {
console.error('[error]'.red,
"robots data is empty or invalid");
return false;
}
console.info('[info]'.green, "found", files.length, "dirs");
files.forEach(function(f){
const {exec} = require('child_process');
let _url = http_options['protocol'] + "//" + http_options['hostname'] + f;
exec("curl -I "+_url+" | head -n1 | cut -d' ' -f2", (err, stdout, stderr) => {
print_code(parseInt(stdout), f);
});
});
return true;
}
// Exactly one command-line argument (the target URL) is expected;
// help() prints the usage line and exits.
const argv = process.argv;
if (argv.length !== 3)
  help(argv[1]);
var url = argv[2];

// Accept only http:// and https:// URLs. The scheme is lowercased because
// it is later used as a module name and compared against "https".
var proto;
const scheme_match = /^(https?):\/\//i.exec(url);
if (scheme_match !== null)
  proto = scheme_match[1].toLowerCase();
if (proto == undefined) {
  console.error('[error]'.red, "invalid URL:", url);
  process.exit(2);
}

// check if robots.txt is already specified in URL
if (!/robots\.txt$/i.test(url)) {
  if (!/\/$/.test(url))
    url += "/";
  url += "robots.txt";
}
console.info('[info]'.green, "using", url);
// The URL scheme doubles as the name of the module to load
// ('http' or 'https').
const http = require(proto);
const url_parser = require('url');
var http_options = url_parser.parse(url);
// Send a desktop Chrome User-Agent string with the request.
http_options.headers = {
  'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36"+
    " (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
};
if (proto == "https") {
  // don't check certs on https
  // NOTE(review): this turns off TLS certificate verification, so the
  // fetched robots.txt cannot be trusted to come from the named host.
  http_options.rejectUnauthorized = false;
}

// Fetch robots.txt and hand the complete body to parse_robots().
const req = http.get(http_options, (res) => {
  const {statusCode} = res;
  // 400 itself is a client error too; the previous `> 400` let it fall
  // through to the redirection branch.
  if (statusCode >= 400) {
    console.error('[error]'.red,
      "response status code:", statusCode);
    // Drain the body we are not going to read so the socket is released.
    res.resume();
    return;
  }
  if (statusCode >= 300) { // is redirection? (3xx HTTP code)
    console.error('[error]'.red,
      "not following redirection to location:", res.headers.location);
    res.resume();
    return;
  }

  let robots = "";
  res.setEncoding('utf8');
  res.on('data', (data) => {
    robots += data;
  });
  res.on('end', () => {
    parse_robots(robots);
  });
}).on('error', (e) => {
  console.error(e);
  process.exit(3);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment