Skip to content

Instantly share code, notes, and snippets.

@grim-reapper
Forked from martincharlesworth/crawluniq.js
Created March 4, 2016 06:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save grim-reapper/55c92fb9ac7bc7326444 to your computer and use it in GitHub Desktop.
Save grim-reapper/55c92fb9ac7bc7326444 to your computer and use it in GitHub Desktop.
PhantomJS crawler written to detect Mixed Content
// Distinct resource URLs (those beginning with "http") seen in responses while
// crawling; sorted and printed with a "uniq " prefix once the crawl finishes.
var uniqUrls = [];
// FIFO queue of page URLs still waiting to be opened.
var urlsToBrowse = [];
// Page URLs that have already been opened (successfully or not), so that they
// are never queued a second time.
var browsedUrls = [];
function open(url, callback) {
var page = require('webpage').create();
page.settings.loadImages = true;
page.onResourceReceived = function (response) {
if (response.stage == "start" && response.url.substr(0, 4) === "http" && uniqUrls.indexOf(response.url) === -1) {
uniqUrls.push(response.url);
}
}
page.open(url, function(status) {
browsedUrls.push(url);
if (status !== "success") {
if (url === args[1]) {
console.log("Couldn't open " + url);
phantom.exit(1);
}
else {
console.log("fail " + url);
}
}
else {
var uniqHrefs = page.evaluate(function() {
var uniqHrefs = [];
var l = document.links;
for(var i=0; i<l.length; i++) {
var href = l[i].getAttribute("href");
if (href && href.length > 1 && href.charAt(0) == '/' && href.charAt(1) != '/') {
href = href.replace(/\/$/, '');
if (uniqHrefs.indexOf(href) === -1) {
uniqHrefs.push(href);
}
}
}
return uniqHrefs;
});
if (uniqHrefs) {
uniqHrefs.forEach(function(href) {
var url_without_path = url.split("/").slice(0,3).join("/");
var urlFromHref = url_without_path + href;
if (browsedUrls.indexOf(urlFromHref) === -1 && urlsToBrowse.indexOf(urlFromHref) === -1) {
urlsToBrowse.push(urlFromHref);
}
});
}
}
page.close();
callback.apply();
});
}
/**
 * Drives the crawl one page at a time.
 *
 * While `urlsToBrowse` still holds entries, the first one is removed, logged
 * with an "open " prefix and handed to `open`, with `crawl` itself as the
 * continuation. Once the queue is empty, `uniqUrls` is sorted in place, each
 * entry is printed with a "uniq " prefix, and PhantomJS is asked to exit with
 * status 0.
 */
function crawl() {
    var next;
    var i;

    if (urlsToBrowse.length > 0) {
        next = urlsToBrowse.shift();
        console.log("open " + next);
        open(next, crawl);
        return;
    }

    // Queue drained: emit the final report and finish.
    uniqUrls.sort();
    for (i = 0; i < uniqUrls.length; i++) {
        console.log("uniq " + uniqUrls[i]);
    }
    phantom.exit(0);
}
// Entry point. args[0] is the script name, so a length of 1 means no URL was
// supplied on the command line.
var args = require('system').args;
if (args.length === 1) {
    console.log('Please specify a URL.');
    phantom.exit(1);
} else {
    // phantom.exit() does not abort the running script, so the crawl must live
    // in the else branch; otherwise `undefined` would be queued and opened
    // after the usage error.
    urlsToBrowse.push(args[1]);
    crawl();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment