Skip to content

Instantly share code, notes, and snippets.

@iczero
Created November 30, 2018 06:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iczero/d20b5457131344d6fc61717e42c72fa3 to your computer and use it in GitHub Desktop.
Save iczero/d20b5457131344d6fc61717e42c72fa3 to your computer and use it in GitHub Desktop.
Crappy function that gets URL title. Will fix later probably
// Passes most of the tests on http://ircbot.science/
// Needs a URL regex assigned to the URL_REGEX variable (getUrlTitle itself does not reference it).
// use this one if you want: https://gist.github.com/iczero/513afbc94291735a0d94a5a6d0be3827
/**
 * Get a one-line description of a web page by URL, normally its <title>.
 *
 * HTTP redirects (status codes listed in `redircodes`) and
 * <meta http-equiv="refresh"> redirects are followed by calling this function
 * again with `num + 1`; once `num` reaches `redirnum` the callback receives
 * 'Redirect loop'. Responses that are not text/html, or whose declared
 * Content-Length exceeds MAX_LENGTH, are described by their headers instead
 * of being downloaded.
 *
 * Reads these names from the enclosing scope: url, http, https, cheerio,
 * entities, prettysize, redirnum, redircodes, timeout, family, MAX_LENGTH.
 *
 * The callback receives a single string. It is invoked at most once per
 * lookup, and not at all when the host does not resolve (ENOTFOUND), when a
 * connection error carries no code, when a redirect target cannot be parsed,
 * or when the request cannot be created.
 *
 * @param {String} qurl URL in question; a bare host or host:port is tried as http
 * @param {Function} callback Called with the resulting message string
 * @param {Number} num Number of redirects encountered (usually 0)
 * @return {void}
 */
function getUrlTitle(qurl, callback, num = 0) {
  if (num >= redirnum) return callback('Redirect loop');

  let target = qurl;
  let vurl = url.parse(target);
  if (!vurl.protocol) { // it's a bare domain name, try it as http
    target = `http://${target}`;
    vurl = url.parse(target);
  }

  let ssl = false;
  let port;
  if (vurl.protocol === 'http:') {
    port = 80;
  } else if (vurl.protocol === 'https:') {
    port = 443;
    ssl = true;
  } else {
    // this can happen with plain hosts with ports ("example.com:8080")
    return getUrlTitle(`http://${target}`, callback, num);
  }
  if (vurl.port) port = vurl.port;

  const urlOpts = {
    host: vurl.hostname,
    path: vurl.path,
    port,
    headers: {
      'User-Agent': `Mozilla/5.0 (compatible, otherbot url title resolver) Node.js/${process.version}`
    },
    family
  };

  let req;
  let reqtimeout = null;
  let settled = false;

  // Marks this lookup as finished. Returns false if it already was, which is
  // what keeps late 'end'/'error'/timer events from reporting a second time.
  const settle = () => {
    if (settled) return false;
    settled = true;
    clearTimeout(reqtimeout);
    return true;
  };

  // Abort the request and report `message`, unless a result was already delivered.
  const abortWith = (message) => {
    if (!settle()) return;
    req.abort();
    callback(message);
  };

  reqtimeout = setTimeout(() => abortWith('Request took too long'), timeout);

  const errorHandler = (err) => {
    // Errors after completion are usually caused by our own abort(); ignore them.
    if (!settle()) return;
    console.error(err);
    req.abort();
    // Unresolvable hosts and errors without a code are deliberately not reported.
    if (err.code && err.code !== 'ENOTFOUND') {
      callback(`Connection error: ${err.code}`);
    }
  };

  const redirectTo = (toURL) => {
    if (!settle()) return;
    req.abort();
    let next;
    try {
      // Resolve relative redirect targets against the URL we just requested.
      next = new url.URL(toURL, target).href;
    } catch (e) {
      // Unparsable redirect target: give up without reporting, but leave a trace.
      console.error(e);
      return;
    }
    getUrlTitle(next, callback, num + 1);
  };

  const handler = (response) => {
    const status = response.statusCode;
    const location = response.headers.location;
    // A redirect status without a Location header is handled like a normal page
    // instead of being resolved to ".../undefined".
    if (redircodes.includes(status) && location) return redirectTo(location);

    // Content-Type and Content-Length are both optional headers.
    const contentType = response.headers['content-type'];
    const rawLength = response.headers['content-length'];
    const parts = [];
    if (contentType) parts.push(`Content-Type: ${contentType}`);
    if (rawLength) parts.push(`Length: ${prettysize(rawLength)}`);
    const desc = `${status} ${parts.length > 0 ? parts.join(', ') : 'No data found'}`;

    // NaN when the header is absent or not numeric; every comparison is then false.
    const declared = rawLength ? Number(rawLength) : NaN;
    if (declared > MAX_LENGTH) return abortWith(desc);
    if (contentType && contentType.split(';')[0].trim().toLowerCase() !== 'text/html') {
      return abortWith(desc);
    }

    // Collect raw bytes so multi-byte characters split across chunks survive
    // and lengths are compared in bytes, like Content-Length.
    const chunks = [];
    let received = 0;

    const parseBody = () => {
      if (settled) return;
      const $ = cheerio.load(Buffer.concat(chunks).toString(), {
        decodeEntities: false
      });
      const refresh = $('meta[http-equiv="refresh"]').first().attr('content');
      const match = refresh ? /url\s*=\s*(.+)$/i.exec(refresh) : null;
      // Strip optional matching quotes around the target: url='http://...'
      const refreshTarget = match ? match[1].trim().replace(/^(['"])(.*)\1$/, '$2') : '';
      if (refreshTarget) return redirectTo(refreshTarget);

      const title = entities.decode($('title').first().text()).replace(/\r|\n|\0/g, '').trim();
      if (!settle()) return;
      if (title) callback(`${status === 200 ? '' : `${status} `}Title: ${title}`);
      else callback(desc);
    };

    response.on('data', (chunk) => {
      if (settled) return;
      const piece = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      chunks.push(piece);
      received += piece.length;
      if (received > declared) {
        return abortWith('Malformed response (mismatched Content-Length)');
      }
      if (received > MAX_LENGTH) {
        // Enough data: stop downloading and parse what we have right away,
        // since 'end' is not guaranteed to fire after an abort.
        req.abort();
        parseBody();
      }
    }).on('end', parseBody).on('error', errorHandler);
  };

  try {
    req = (ssl ? https : http).get(urlOpts, handler).on('error', errorHandler);
  } catch (err) {
    // The request could not be created (e.g. invalid options); nothing is
    // reported to the callback, but the failure is logged.
    settle();
    console.error(err);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment