Created
November 30, 2018 06:28
-
-
Save iczero/d20b5457131344d6fc61717e42c72fa3 to your computer and use it in GitHub Desktop.
Crappy function that gets URL title. Will fix later probably
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Passes most of the tests on http://ircbot.science/
// Needs a URL regex assigned to the URL_REGEX variable.
// Use this one if you want: https://gist.github.com/iczero/513afbc94291735a0d94a5a6d0be3827
/**
 * Fetch a URL and report a short, human-readable summary of it.
 *
 * The callback receives exactly one string, at most once per call:
 *  - `Title: ...` (prefixed with the status code when it is not 200) for HTML
 *    pages that have a non-empty <title>;
 *  - a `<status> Content-Type: ..., Length: ...` style description for
 *    non-HTML responses, responses whose declared length exceeds MAX_LENGTH,
 *    or HTML pages without a title;
 *  - an error message (`Redirect loop`, `Request took too long`,
 *    `Connection error: <code>`, `Invalid redirect target`,
 *    `Malformed response (mismatched Content-Length)`).
 *
 * The callback is NOT invoked for `ENOTFOUND` errors, for errors that carry no
 * `code`, or when the request cannot be created at all; those are only logged.
 *
 * HTTP redirects (status in `redircodes` plus a Location header) and
 * `<meta http-equiv="refresh">` redirects are followed recursively.
 *
 * Relies on names defined elsewhere in this module: `url`, `http`, `https`,
 * `cheerio`, `entities`, `prettysize`, `redirnum`, `redircodes`, `timeout`,
 * `family` and `MAX_LENGTH`.
 *
 * @param {String} qurl URL in question (scheme optional; http is assumed)
 * @param {Function} callback Called with the resulting message string
 * @param {Number} num Number of redirects encountered (usually 0)
 * @return {void}
 */
function getUrlTitle(qurl, callback, num = 0) {
  if (num >= redirnum) return callback('Redirect loop');

  let target = qurl;
  let vurl = url.parse(target);
  if (!vurl.protocol) { // it's a bare domain name, try it as http
    target = `http://${target}`;
    vurl = url.parse(target);
  }

  const urlOpts = {
    host: vurl.hostname,
    path: vurl.path,
    headers: {
      'User-Agent': `Mozilla/5.0 (compatible, otherbot url title resolver) Node.js/${process.version}`
    },
    family
  };
  let ssl = false;
  if (vurl.protocol === 'http:') {
    urlOpts.port = 80;
  } else if (vurl.protocol === 'https:') {
    urlOpts.port = 443;
    ssl = true;
  } else {
    // this can happen with plain hosts with ports ("host:8080" parses as protocol "host:")
    return getUrlTitle(`http://${target}`, callback, num);
  }
  if (vurl.port) urlOpts.port = vurl.port;

  let req;
  let reqtimeout;
  let settled = false;

  // Marks this request as finished exactly once: clears the timeout timer and
  // optionally aborts the request. Returns false if it was already finished,
  // so late events (data/end/error/timeout) can never trigger a second callback.
  const stop = (abort) => {
    if (settled) return false;
    settled = true;
    clearTimeout(reqtimeout);
    if (abort && req) req.abort();
    return true;
  };

  // Finish this request and report `message` to the caller (at most once).
  const finish = (message, abort = true) => {
    if (stop(abort)) callback(message);
  };

  // Continue with another URL. The current request must already be stopped;
  // responsibility for invoking the callback passes to the recursive call.
  const follow = (toURL) => {
    let next;
    try {
      next = new url.URL(toURL, target).href; // resolves relative redirects
    } catch (e) {
      return callback('Invalid redirect target');
    }
    getUrlTitle(next, callback, num + 1);
  };

  const redirectTo = (toURL) => {
    if (stop(true)) follow(toURL);
  };

  const errorHandler = (err) => {
    if (!stop(true)) return; // already answered, aborted or redirected
    console.error(err);
    // Lookup failures and code-less errors are deliberately not reported.
    if (!err.code || err.code === 'ENOTFOUND') return;
    callback(`Connection error: ${err.code}`);
  };

  const handler = (response) => {
    const {statusCode, headers} = response;
    // A redirect status without a Location header has nowhere to go;
    // treat it like any other response instead of resolving "undefined".
    if (redircodes.includes(statusCode) && headers.location) return redirectTo(headers.location);

    // Content-Type and Content-Length are both optional headers.
    const contentType = headers['content-type'];
    const contentLength = headers['content-length'];
    const parts = [];
    if (contentType) parts.push(`Content-Type: ${contentType}`);
    if (contentLength) parts.push(`Length: ${prettysize(contentLength)}`);
    const desc = `${statusCode} ${parts.length ? parts.join(', ') : 'No data found'}`;

    const declaredLength = Number(contentLength); // NaN when absent or not numeric
    if (contentLength && declaredLength > MAX_LENGTH) return finish(desc);
    if (contentType && contentType.split(';')[0].trim().toLowerCase() !== 'text/html') return finish(desc);

    // Keep raw buffers and decode once at the end so multi-byte characters
    // split across chunks survive, and so lengths are compared in bytes.
    const chunks = [];
    let received = 0;

    const parseBody = (abort) => {
      if (!stop(abort)) return;
      const $ = cheerio.load(Buffer.concat(chunks).toString(), {
        decodeEntities: false
      });

      const refresh = $('meta[http-equiv="refresh"]').first().attr('content');
      const refreshMatch = refresh ? /url\s*=\s*(.+)$/i.exec(refresh) : null;
      // Strip optional surrounding quotes: content="0; URL='/next'"
      const refreshTarget = refreshMatch ? refreshMatch[1].trim().replace(/^(['"])(.*)\1$/, '$2') : '';
      if (refreshTarget) return follow(refreshTarget);

      const rawTitle = $('title').first().text();
      const title = rawTitle ? entities.decode(rawTitle).replace(/\r|\n|\0/g, '').trim() : '';
      if (!title) return callback(desc);
      callback(`${statusCode === 200 ? '' : `${statusCode} `}Title: ${title}`);
    };

    response.on('data', (chunk) => {
      if (settled) return;
      chunks.push(chunk);
      received += chunk.length;
      if (contentLength && received > declaredLength) {
        return finish('Malformed response (mismatched Content-Length)');
      }
      // Enough data: stop downloading and parse what we have right away,
      // since an aborted response is not guaranteed to emit 'end'.
      if (received > MAX_LENGTH) parseBody(true);
    }).on('end', () => parseBody(false)).on('error', errorHandler);
  };

  reqtimeout = setTimeout(() => finish('Request took too long'), timeout);
  try {
    req = (ssl ? https : http).get(urlOpts, handler).on('error', errorHandler);
  } catch (err) {
    // The request could not even be created (e.g. invalid options).
    // Nothing is reported to the caller; log it so the failure is not invisible.
    stop(false);
    console.error(err);
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment