Skip to content

Instantly share code, notes, and snippets.

@leodutra
Last active May 23, 2017 16:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leodutra/3cd39249315b37da7dba to your computer and use it in GitHub Desktop.
Save leodutra/3cd39249315b37da7dba to your computer and use it in GitHub Desktop.
Simple JavaScript Crawler (js, crawler, javascript)
/*
JavaScript Link Crawler
author: Leonardo Dutra (leodutra.br@gmail.com)
Instructions: open browser console, paste, run.
*/
// Crawler state shared by every function below.
var limit = 30000; // run() keeps crawling while `success` is below this value
var linkHolder = {}; // object used as a hash of links: URI -> LinkInfo
var SCPReferenciado = []; // NOTE(review): not referenced anywhere in the visible source
var visitedDomains = []; // domains whose pending links getUnvisitedURI() serves first
// Declared one by one: a chained `var success = found = executionTime = 0`
// only declares `success` and leaks the other two as implicit globals
// (a ReferenceError in strict mode).
var success = 0; // number of links fetched successfully
var found = 0; // number of distinct links registered in linkHolder
var executionTime = 0; // timestamp (ms) of the last start()
var currentDomain = null; // hostname currently being crawled
var stopped = false; // set to true by run() once the crawl pauses
/**
 * Extracts link candidates from a page body and resolves them against the
 * URI the body came from.
 *
 * @param {string} html Response body to scan.
 * @param {string} origin URI the body was fetched from.
 * @returns {?Array} Whatever relativeToAbsolute() returns for the cleaned
 *     matches, or null when nothing in the body looks like a link.
 */
function getLinks(html, origin) {
  // Matches href/src attributes as well as quoted or tag-delimited text
  // (e.g. inside JavaScript) that ends in a page-like extension.
  var linkPattern = /(?:\b(?:href|src)[^=]*=|["'>])[^"'<>]+?\.(?:html|php|asp|do|jsp|htm)\b[^"'<>\s]*/gim;
  // Whitespace, line breaks, quotes, '>' and the leading attribute name.
  var junkPattern = /["\s'>\r\n]+|^(?:href|src)[^=]*=/gim;

  var candidates = removeComments(html).match(linkPattern);
  if (!candidates) {
    return null;
  }
  for (var idx = 0; idx < candidates.length; idx++) {
    candidates[idx] = candidates[idx].replace(junkPattern, '');
  }
  return relativeToAbsolute(candidates, origin);
}
/**
 * Removes HTML comments, JavaScript block comments and JavaScript line
 * comments from a page body so that links inside them are not crawled.
 *
 * @param {string} html Text to clean.
 * @returns {string} The text without comments.
 */
function removeComments(html) {
  return html
    // HTML <!-- ... --> and JS /* ... */; lazy so each comment ends at its
    // own terminator, and [\s\S] so a comment may span lines or contain
    // '>' and '*' characters.
    .replace(/<!--[\s\S]*?-->|\/\*[\s\S]*?\*\//g, '')
    // JS line comments. A "//" right after ':' is kept so "http://" survives.
    // TODO: a "//" that is not an actual comment (e.g. a protocol-relative
    // URL) is still removed.
    .replace(/([^:])\/\/.*/gm, '$1');
}
/**
 * Creates an anchor element whose href is `uri`, used as a stand-in for a
 * location object (getUnvisitedURI() reads `.hostname` from it).
 *
 * @param {string} uri URI to assign to the anchor.
 * @returns {Object} The anchor element created through `document`.
 */
function getLocationInfo(uri) {
  var anchor = document.createElement('a');
  anchor.href = uri;
  return anchor;
}
/**
 * Extracts the "scheme://host" prefix of an absolute URI.
 *
 * @param {string} uri Absolute URI.
 * @returns {string} The URI up to, but not including, the first '/' after
 *     the host.
 * @throws {TypeError} When `uri` does not start with "scheme://host".
 */
function getDomain(uri) {
  var schemeAndHost = /^\w+:\/\/[^\/]+/.exec(uri);
  return schemeAndHost[0];
}
/**
 * Turns relative paths into full URIs, in place.
 *
 * Links without a scheme are prefixed with the site root when they start
 * with '/', otherwise with the directory of `origin`. Afterwards every run
 * of slashes that does not directly follow ':' is collapsed to one slash.
 *
 * @param {Array} links Links to resolve; the array is modified and returned.
 * @param {string} origin Absolute URI of the page the links came from.
 * @returns {Array} The same `links` array.
 */
function relativeToAbsolute(links, origin) {
  var baseDir = getAbsolutePath(origin) + '/';
  var siteRoot = getDomain(origin) + '/';
  var hasScheme = /^\w+:\/\//i;

  for (var n = links.length - 1; n >= 0; n--) {
    var href = links[n];
    if (!hasScheme.test(href)) {
      href = (href.charAt(0) === '/' ? siteRoot : baseDir) + href;
    }
    links[n] = href.replace(/([^:])\/\/+/g, '$1/');
  }
  return links;
}
// function backTrackURI(uri) {
// return uri.replace(/(?:http[s]?:\/)?\/*?[^\/]+?\/?$/im, '');
// }
/**
 * Returns the directory part of an absolute URI: scheme, host and the path
 * up to and including its last '/'. Query string and fragment are ignored.
 *
 * Cutting at the last slash of the path (instead of walking segment by
 * segment and rejecting any segment that contains a dot) keeps dotted
 * directory names intact, e.g. "http://x.com/v1.2/page.html" yields
 * "http://x.com/v1.2/".
 *
 * @param {string} uri Absolute URI.
 * @returns {string} The directory URI; just "scheme://host" when the URI
 *     has no path.
 * @throws {TypeError} When `uri` does not start with "scheme://host".
 */
function getAbsolutePath(uri) {
  // [1] scheme://host, [2] path without query string or fragment
  var parts = uri.match(/^(\w+:\/\/[^\/]+)([^?#]*)/);
  var path = parts[2];
  return parts[1] + path.slice(0, path.lastIndexOf('/') + 1);
}
/**
 * Picks the next link whose status is still '_' (unvisited), preferring
 * domains in this order:
 *   1. domains already listed in `visitedDomains`,
 *   2. the domain currently being crawled (`currentDomain`),
 *   3. any other link, whose hostname then becomes `currentDomain`.
 *
 * When the current domain has no pending links left it is appended to
 * `visitedDomains`.
 *
 * @returns {?string} The chosen URI, or null when nothing is pending.
 */
function getUnvisitedURI() {
  // First pending link, optionally restricted to URIs containing `fragment`.
  function firstPending(fragment) {
    for (var uri in linkHolder) {
      if (fragment !== undefined && uri.indexOf(fragment) === -1) {
        continue;
      }
      if (linkHolder[uri].status === '_') {
        return uri;
      }
    }
    return null;
  }

  var pending;
  for (var d = 0; d < visitedDomains.length; d++) {
    pending = firstPending(visitedDomains[d]);
    if (pending !== null) {
      return pending;
    }
  }

  if (currentDomain) {
    pending = firstPending(currentDomain);
    if (pending !== null) {
      return pending;
    }
    visitedDomains.push(currentDomain);
  }

  pending = firstPending();
  if (pending !== null) {
    currentDomain = getLocationInfo(pending).hostname;
  }
  return pending;
}
/**
 * Per-link record stored in `linkHolder`. Defaults live on the prototype so
 * every instance shares them until a value is assigned.
 *
 * Status codes used in this file: '_' unvisited (default), 'V' visited and
 * 'X' broken are assigned by visitLink(); 'R' (redirected) is reported by
 * status() but never assigned in the visible source.
 *
 * @param {string=} origin URI of the page where the link was found. When
 *     omitted (the seed link) the record keeps the '' default instead of
 *     storing `undefined`, which would later be rendered as the text
 *     "undefined".
 */
function LinkInfo(origin) {
  this.origin = (origin === undefined || origin === null) ? '' : origin;
}
LinkInfo.prototype = {
  status: '_',
  origin: ''
};
/**
 * Registers links in the `linkHolder` hash, skipping the ones already
 * known, and counts each new one in `found`.
 *
 * The list is walked from its last element to its first, so new keys are
 * inserted in reverse list order.
 *
 * @param {?Array} links URIs to register; a falsy value is ignored.
 * @param {string=} origin URI of the page the links were found on.
 */
function pushLinks(links, origin) {
  if (!links) {
    return;
  }
  for (var pos = links.length - 1; pos >= 0; pos--) {
    var uri = links[pos];
    if (!linkHolder[uri]) {
      linkHolder[uri] = new LinkInfo(origin);
      found++;
    }
  }
}
/**
 * Formats the time elapsed since `executionTime` as the time-of-day part
 * of an ISO timestamp.
 *
 * @returns {string} Elapsed time as "HH:MM:SS.mmm".
 */
function getExecutionTime() {
  var elapsedMs = Date.now() - executionTime;
  var iso = new Date(elapsedMs).toISOString(); // e.g. 1970-01-01T00:01:02.345Z
  // Keep what lies between the 'T' separator and the trailing 'Z'.
  return iso.substring(iso.indexOf('T') + 1, iso.length - 1);
}
/**
 * Renders a URI as an HTML anchor that opens in a new tab.
 *
 * The URI comes from crawled pages, so the HTML-significant characters are
 * escaped before it is placed in the attribute and in the link text.
 *
 * @param {string} href URI to render.
 * @returns {string} HTML markup for the anchor.
 */
function toLink(href) {
  var safe = String(href)
    .replace(/&/g, '&amp;') // must run first so later entities are not re-escaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
  return '<a href="' + safe + '" target="_blank">' + safe + '</a>';
}
/**
 * Logs a short summary to the console and returns the links grouped by
 * status. Each entry is an HTML snippet: status code, the link, and the
 * page the link was found on.
 *
 * @returns {{broken: Array, visited: Array, unvisited: Array, redirected: Array}}
 */
function status() {
  var visited = [];
  var broken = [];
  var unvisited = [];
  var redirected = [];
  // Status code -> list collecting it. A lookup table keeps the target list
  // local; the previous switch assigned to an undeclared `type` variable,
  // leaking a global and reusing the last list for an unknown code.
  var buckets = {
    '_': unvisited,
    'V': visited,
    'X': broken,
    'R': redirected
  };
  var info;
  for (var link in linkHolder) {
    info = linkHolder[link];
    if (!Object.prototype.hasOwnProperty.call(buckets, info.status)) {
      continue; // unknown status code: nothing in this file assigns one
    }
    buckets[info.status].push(
      info.status + ' ' + toLink(link) + '<span>&nbsp;' + toLink(info.origin) + '</span>'
    );
  }
  console.log([
    'Execution time: ' + getExecutionTime(),
    found + ' found',
    unvisited.length + ' unvisited',
    visited.length + ' visited',
    redirected.length + ' redirected',
    broken.length + ' broken'
  ].join('\n'));
  return {
    broken: broken,
    visited: visited,
    unvisited: unvisited,
    redirected: redirected
  };
}
/**
 * Opens a popup listing every link grouped by status, ready for printing.
 * Falls back to an alert when the popup is blocked.
 */
function showLinks() {
  var data = status();
  var br = '<br/>';
  var logInfo = [
    'Execution time: ' + getExecutionTime(),
    found + ' found',
    data.unvisited.length + ' unvisited',
    data.visited.length + ' visited',
    data.redirected.length + ' redirected',
    data.broken.length + ' broken',
    br,
    '### BROKEN: ' + data.broken.length,
    data.broken.sort().join(br),
    br,
    '### REDIRECTED: ' + data.redirected.length,
    data.redirected.sort().join(br),
    br,
    '### VISITED: ' + data.visited.length,
    data.visited.sort().join(br),
    br,
    '### UNVISITED: ' + data.unvisited.length,
    data.unvisited.sort().join(br)
  ];
  // An empty URL opens a blank page; `null` would be converted to the
  // string "null" and treated as a relative URL.
  var popup = open('', '_blank');
  if (!popup) {
    alert('Popup bloqueado.');
    return;
  }
  popup.document.write(
    '<head><style>a {color: #555;text-decoration: none;} span a {color: #bbb;}</style></head>' +
    '<body>' +
    '<div style="white-space:nowrap;font-size: 12px; font-family: Consolas,\'Lucida Console\',\'DejaVu Sans Mono\',monospace;">' +
    logInfo.join(br) +
    '</div></body>' // no stray </pre>: no <pre> is ever opened
  );
  // Finish the document so the popup stops showing a loading state.
  popup.document.close();
}
/**
 * Fetches one link, records the outcome in `linkHolder` and hands control
 * back to run() (crawl step).
 *
 * On success the link is marked 'V', `success` is incremented and, when the
 * response is a string, the links found in it are registered. On failure
 * the link is marked 'X'. A falsy `link` means there is nothing left to
 * crawl and only logs a message.
 *
 * @param {?string} link URI to fetch; must be a key of `linkHolder`.
 */
function visitLink(link) {
  function onFailure() {
    linkHolder[link].status = 'X';
    run();
  }

  function onSuccess(data) {
    ++success;
    linkHolder[link].status = 'V';
    if (typeof data === 'string') {
      pushLinks(getLinks(data, link), link);
    }
    run();
  }

  if (!link) {
    console.log('FINISHED (no more links to crawl)');
    return;
  }
  // Only `url` is passed; no xhrFields (e.g. withCredentials) are set.
  jQuery.ajax({ url: link }).fail(onFailure).done(onSuccess);
}
/**
 * Performs one crawl step. While the crawl is not stopped and fewer than
 * `limit` links were fetched successfully, the next unvisited link is
 * visited. Otherwise the crawl is flagged as stopped, the report popup is
 * shown and `limit` is raised to the next multiple of itself above
 * `success`, so a later start() can continue.
 */
function run() {
  var mayContinue = !stopped && success < limit;
  if (mayContinue) {
    visitLink(getUnvisitedURI());
    return;
  }
  stopped = true;
  showLinks();
  // `| 0` truncates the quotient to an integer.
  limit = ((success / limit) | 0) * limit + limit;
}
/**
 * Starts (or resumes) the crawl and resets the execution timer.
 *
 * @param {number=} newLimit Optional new value for the global `limit`.
 *     The parameter used to be named `limit`, which shadowed the global and
 *     was never read, so a value passed here had no effect. Anything that
 *     is not a positive number leaves the current limit untouched.
 */
function start(newLimit) {
  if (typeof newLimit === 'number' && newLimit > 0) {
    limit = newLimit;
  }
  stopped = false;
  console.log('RUNNING...\nUse status() and showLinks()');
  executionTime = Date.now();
  run();
}
/**
 * Loads an external script by appending a <script> element to the page
 * (used to bring in the jQuery the crawler needs).
 *
 * @param {string} src URL of the script to load.
 */
function importScript(src) {
  var script = document.createElement('script');
  script.type = 'text/javascript';
  script.src = src;
  // Prefer <head>; fall back to <body> when the document has no head.
  var host = document.head || document.body;
  host.appendChild(script);
}
// START
// Load jQuery, seed the crawl with the current page, then start as soon as
// jQuery.ajax is actually available. Polling replaces the fixed 2 second
// wait, which started the crawl whether or not the script had arrived.
importScript('//ajax.googleapis.com/ajax/libs/jquery/1.8.1/jquery.min.js');
pushLinks([location.href]);
(function waitForJQuery(attemptsLeft) {
  if (typeof jQuery !== 'undefined' && typeof jQuery.ajax === 'function') {
    start();
  } else if (attemptsLeft > 0) {
    setTimeout(function () {
      waitForJQuery(attemptsLeft - 1);
    }, 200);
  } else {
    console.log('jQuery could not be loaded; crawler not started.');
  }
})(50); // 50 attempts x 200 ms = wait up to about 10 seconds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment