-
-
Save n1k0/4509789 to your computer and use it in GitHub Desktop.
/**
 * This casper script checks for 404 internal links for a given root url.
 *
 * Usage:
 *
 *     $ casperjs 404checker.js http://mysite.tld/
 *     $ casperjs 404checker.js http://mysite.tld/ --max-depth=42
 */
/*global URI*/ | |
// Headless browser instance. Images and plugins are switched off because
// only the HTTP status and the anchors of each page are inspected.
var casper = require("casper").create({
    pageSettings: {
        loadPlugins: false,
        loadImages: false
    }
});

// CasperJS/PhantomJS helper modules.
var fs = require('fs');
var utils = require('utils');
var f = utils.format;

// Command line: first positional argument is the root url, and
// --max-depth caps how many links are visited (100 when absent or 0).
var url = casper.cli.get(0);
var upTo = ~~casper.cli.get('max-depth') || 100;

// Crawl state shared by the functions below.
var baseUrl = url;      // prefix an absolute link must have to be followed
var links = [url];      // queue of links to visit, seeded with the root url
var currentLink = 0;    // index into `links` of the next link to visit
var checked = [];       // links that have already been opened
/**
 * Resolves a (possibly relative) url against a base url.
 *
 * Relies on the global `URI` constructor that the bootstrap step installs
 * on `window`.
 *
 * @param {String} url  Link to resolve.
 * @param {String} base Url the link is resolved against.
 * @return {String} String form of the resolved URI.
 */
function absPath(url, base) {
    var target = new URI(url);
    var resolved = target.resolve(new URI(base));
    return resolved.toString();
}
/**
 * Turns the raw href values scraped from a page into the list of absolute,
 * in-scope urls that have not been checked yet.
 *
 * A link is kept when it is:
 *   - relative (no scheme, not fragment-only), or
 *   - absolute / protocol-relative AND located under `baseUrl`.
 *
 * Links are resolved before they are de-duplicated, so that different
 * spellings of the same target ("a.html", "./a.html") collapse into one.
 *
 * @param {Array}  urls Raw href attribute values; null/undefined is treated
 *                      as an empty list.
 * @param {String} base Url of the page the hrefs were found on.
 * @return {Array} Absolute urls, unique, none of them present in `checked`.
 */
function cleanLinks(urls, base) {
    // RFC 3986 scheme followed by ':' (http:, mailto:, javascript:, tel:...).
    // Matching on the colon keeps relative paths such as "http-api.html".
    var hasScheme = /^[a-z][a-z0-9+.\-]*:/i;
    var resolved = [];

    (urls || []).forEach(function(url) {
        if (typeof url !== 'string' || url.charAt(0) === '#') {
            return; // same-page anchor, nothing to fetch
        }
        if (hasScheme.test(url) && url.indexOf(baseUrl) !== 0) {
            return; // other scheme or other site
        }
        var absolute = absPath(url, base);
        // "//host/path" carries no scheme, so its host can only be judged
        // once it has been resolved.
        if (url.indexOf('//') === 0 && absolute.indexOf(baseUrl) !== 0) {
            return;
        }
        resolved.push(absolute);
    });

    return utils.unique(resolved).filter(function(url) {
        return checked.indexOf(url) === -1;
    });
}
/**
 * Schedules the steps that check one link: open it, report its HTTP
 * status, then queue the links found on the page.
 *
 * Must be invoked with the casper instance as `this`.
 *
 * @param {String} link Absolute url to open.
 */
function crawl(link) {
    this.start().then(function() {
        this.echo(link, 'COMMENT');
        this.open(link);
        checked.push(link);
    });
    this.then(function() {
        var status = this.currentHTTPStatus;
        if (status === 404) {
            this.warn(link + ' is missing (HTTP 404)');
        } else if (status >= 400) {
            // Any other client/server error (403, 410, 500, 503...) is a
            // failure too, not only 500.
            this.warn(link + f(' is broken (HTTP %s)', status));
        } else {
            this.echo(link + f(' is okay (HTTP %s)', status));
        }
    });
    this.then(function() {
        var queued = 0;
        // The queue is append-only: check() walks it with an ever growing
        // index, so removing already-checked entries would shift the
        // remaining ones and make the crawler skip links or stop early.
        searchLinks.call(this).forEach(function(url) {
            if (links.indexOf(url) === -1 && checked.indexOf(url) === -1) {
                links.push(url);
                queued++;
            }
        });
        this.echo(queued + " new links found on " + link);
    });
}
/**
 * Collects the href attribute of every <a href> element of the current
 * page and returns the ones worth crawling, as decided by cleanLinks().
 *
 * Must be invoked with the casper instance as `this`.
 *
 * @return {Array} Result of cleanLinks() for the hrefs found on the page.
 */
function searchLinks() {
    var hrefs = this.evaluate(function _fetchInternalLinks() {
        return [].map.call(__utils__.findAll('a[href]'), function(node) {
            return node.getAttribute('href');
        });
    });
    // NOTE(review): evaluate() appears to yield null when the function
    // could not run in the page context (e.g. the page failed to load);
    // fall back to "no links" instead of passing null on.
    return cleanLinks(hrefs || [], this.getCurrentUrl());
}
/**
 * Run-completion callback driving the crawl: takes the next queued link,
 * schedules its steps and re-runs casper with itself as the callback.
 *
 * The crawl ends when the queue has no entry at the current index or when
 * `upTo` links have been taken from it. `upTo` is filled from --max-depth,
 * so that option caps the number of visited links.
 *
 * Must be invoked with the casper instance as `this`.
 */
function check() {
    var nextLink = links[currentLink];

    if (!nextLink || currentLink >= upTo) {
        this.echo("All done, " + checked.length + " links checked.");
        this.exit();
        return;
    }

    crawl.call(this, nextLink);
    currentLink += 1;
    this.run(check);
}
// The root url is mandatory: without it there is nothing to crawl.
if (!url) {
    casper.warn('No url passed, aborting.');
    casper.exit();
}
// Bootstrap step: downloads the URI.js source, evaluates it and exposes
// the resulting constructor as window.URI (used by absPath()).
casper.start('https://js-uri.googlecode.com/svn/trunk/lib/URI.js', function() {
    var scriptCode = this.getPageContent() + '; return URI;';
    try {
        // NOTE(review): this executes code fetched over the network; a
        // copy of URI.js shipped next to the script would be safer.
        window.URI = new Function(scriptCode)();
    } catch (e) {
        // The download did not yield valid JavaScript (e.g. an HTML error
        // page). Fall through to the warning below instead of dying on an
        // uncaught SyntaxError/ReferenceError.
        window.URI = undefined;
    }
    if (typeof window.URI === "function") {
        this.echo('URI.js loaded');
    } else {
        this.warn('Could not setup URI.js').exit();
    }
});
/**
 * Completion callback of the bootstrap run: starts a fresh casper suite,
 * announces the crawl and hands control over to check().
 */
function process() {
    casper.start();
    casper.then(function() {
        this.echo("Starting");
    });
    casper.run(check);
}

// Execute the bootstrap step, then begin crawling.
casper.run(process);
As a side note, this example will not work with PhantomJS 1.9.0 (in fact, everything testing the HTTP status code will be wrong):
[dalexandre:~/Desktop] $ casperjs 404checker.js http://phantomjs.org/coucou-I-do-not-exist
URI.js loaded
Starting
http://phantomjs.org/coucou-I-do-not-exist
http://phantomjs.org/coucou-I-do-not-exist is okay (HTTP 200)
1 new links found on http://phantomjs.org/coucou-I-do-not-exist
All done, 1 links checked.
[dalexandre:~/Desktop] $ phantomjs -v
1.9.0
[dalexandre:~/Desktop] $ casperjs --version
1.0.2
Hello!
Awesome script, very helpful!
I'm a CasperJS noob. I tried to add a CasperJS test assertion to it in order to get an xunit report for this check. Here is what I mean:
// Commenter's variant of crawl(): the same three steps (open the link,
// check its status, queue the links found on the page), except that the
// status step is a single assertHttpStatus(200) call instead of echo/warn.
function crawl(link) {
this.start().then(function() {
this.echo(link, 'COMMENT');
this.open(link);
checked.push(link);
});
this.then(function() {
// NOTE(review): `test` is not defined anywhere in this snippet; presumably
// it is the tester object handed to the test.begin() callback - confirm.
test.assertHttpStatus(200);
});
this.then(function() {
var newLinks = searchLinks.call(this);
// Rebuilds the queue from the old queue plus the new links, minus every
// link that is already in `checked`.
links = links.concat(newLinks).filter(function(url) {
return checked.indexOf(url) === -1;
});
this.echo(newLinks.length + " new links found on " + link);
});
}
I also wrapped all of the code in test.begin(), but it didn't work. The script runs the tests correctly but doesn't generate the final report or the xunit file.
1: thank you.
2: is there a way to limit crawl to local URL's only? E.g. when the script reaches my Google Analytics code it disappears into checking many Google links, slowing things up.
Alternatively, can I exclude links? I've tried to implement my own checks but my limited JS knowledge is stopping me.
Does this script stay on the same domain? If not, a --same-domain=true option would be handy.
The --max-depth option also threw me off at first, but it's still an awesome script. Thanks!
+1 singerxt . How do you get this to dump an xunit file?
For now, I modified the check() function to write the dead links to a file, and can fail a jenkins run if the file exists:
// Tail of the commenter's modified check(): prints the summary and writes
// the dead links to deadLinks.log before exiting.
// NOTE(review): `dead` is not defined in this snippet; presumably an array
// of failing links filled in elsewhere - confirm.
this.echo("All done, " + checked.length + " links checked.");
// Drop the log of a previous run, so that the file only exists when the
// current run found dead links.
if (fs.exists("deadLinks.log")) fs.remove("deadLinks.log");
if (dead.length > 0) {
this.warn("Found " + dead.length + " dead links: ");
for (var i in dead){
this.warn(dead[i]);
}
// One dead link per line.
fs.write("deadLinks.log", dead.join("\n"));
}
this.exit();
I modified this script (quickly... there could be a cleaner way of doing this, but I only looked into casperjs today) so that it can be executed as a unit test and output xunit files.
See this gist: https://gist.github.com/jnankin/66829583bbcd9d16bb61
Sorry for my bad English. Your code does not take into account the presence of a <base> tag in the <head>.
Sorry for my messy code, I am a junior developer.
// My version of the function, taking the <base> tag into account.
// Returns cleanLinks() applied to the hrefs of the page, where hrefs that
// do not start with '#', 'ftp', 'javascript', 'http' or '/' are prefixed
// with the href of the page's <base> element (empty string when absent).
function searchLinks() {
return cleanLinks(this.evaluate(function _fetchInternalLinks() {
var base = "";
if (__utils__.exists('base')){
base = __utils__.findOne('base').getAttribute('href');
}
return [].map.call(__utils__.findAll('a[href]'), function(node) {
var link = node.getAttribute('href');
// Plain string concatenation: the result is only a valid url when the
// <base> href ends where the link begins (e.g. with a trailing slash).
if (!new RegExp('^(#|ftp|javascript|http|/)').test(link)){
return base + link;
}else{
return link;
}
});
}), this.getCurrentUrl());
}
I don't think it works right when a site requires certificates.
just an FYI, looks like https://js-uri.googlecode.com/svn/trunk/lib/URI.js is 404'ing now 😢
@shuynh you are correct:
$ curl -I https://js-uri.googlecode.com/svn/trunk/lib/URI.js
HTTP/1.1 404 Not Found
This code block will not work now:
// Quoted bootstrap step: fetches URI.js from the url below, evaluates the
// page content as a function body returning URI and stores the result in
// window.URI; warns and exits when the result is not a function.
casper.start('https://js-uri.googlecode.com/svn/trunk/lib/URI.js', function() {
var scriptCode = this.getPageContent() + '; return URI;';
window.URI = new Function(scriptCode)();
if (typeof window.URI === "function") {
this.echo('URI.js loaded');
} else {
this.warn('Could not setup URI.js').exit();
}
});
I am unable to find a suitable mirror unfortunately.
granze : is it an improvement ? or messy code ?