Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
A CasperJS script to check for 404 & 500 internal links on a given website
/**
* This CasperJS script checks for broken (HTTP 404 and 500) internal links for a given root url.
*
* Usage:
*
* $ casperjs 404checker.js http://mysite.tld/
* $ casperjs 404checker.js http://mysite.tld/ --max-depth=42
*/
/*global URI*/
// Crawler instance; its page settings disable image and plugin loading.
var casper = require("casper").create({
    pageSettings: { loadImages: false, loadPlugins: false }
});

// Helper modules and the string formatter used for status messages.
var fs = require('fs');
var utils = require('utils');
var f = utils.format;

// Command-line input: the root URL (first positional argument) and the
// --max-depth option, truncated to an integer; 100 is used when the option
// is absent or evaluates to 0.
var url = casper.cli.get(0);
var upTo = ~~casper.cli.get('max-depth') || 100;

// Crawl state: URL prefix that marks a link as internal, the queue of links
// to visit, the links already visited and the index of the next queue entry.
var baseUrl = url;
var links = [url];
var checked = [];
var currentLink = 0;
/**
 * Resolves `url` against `base` with the global URI class and returns the
 * result as a string.
 *
 * @param {string} url  Link to resolve (may be relative).
 * @param {string} base URL the link is resolved against.
 * @returns {string} String form of the resolved URI.
 */
function absPath(url, base) {
    var target = new URI(url);
    var reference = new URI(base);
    var resolved = target.resolve(reference);
    return resolved.toString();
}
// Hrefs that are never treated as relative internal links: fragments,
// protocol-relative URLs (//host/...) and anything carrying an explicit
// scheme (http:, ftp:, javascript:, mailto:, tel:, data:, ...).
// Built once instead of once per filtered element.
var NON_CRAWLABLE_HREF = /^(#|\/\/|[a-z][a-z0-9+.\-]*:)/i;

/**
 * Reduces the raw hrefs found on a page to the absolute URLs that still have
 * to be crawled.
 *
 * An href is kept when it starts with `baseUrl`, or when it is a plain
 * relative reference. Kept hrefs are resolved against `base`, then
 * de-duplicated and stripped of URLs already present in `checked`.
 *
 * @param {Array.<string>} urls Raw href values; a falsy value is treated as
 *                              an empty list.
 * @param {string} base         URL the relative hrefs are resolved against.
 * @returns {Array.<string>} Absolute, unique, not-yet-checked URLs.
 */
function cleanLinks(urls, base) {
    return utils.unique(urls || []).filter(function(url) {
        return url.indexOf(baseUrl) === 0 || !NON_CRAWLABLE_HREF.test(url);
    }).map(function(url) {
        return absPath(url, base);
    }).filter(function(url, index, resolved) {
        // Distinct hrefs ("about", "./about") can resolve to the same URL,
        // so uniqueness is enforced again after resolution.
        return resolved.indexOf(url) === index && checked.indexOf(url) === -1;
    });
}
/**
 * Opens `link`, reports its HTTP status and appends the links found on the
 * page to the global `links` queue.
 *
 * Must be called with the casper instance as `this`. The work is queued as
 * casper steps; nothing is executed until the caller invokes run().
 *
 * @param {string} link Absolute URL to open and inspect.
 */
function crawl(link) {
    this.start().then(function() {
        this.echo(link, 'COMMENT');
        this.open(link);
        checked.push(link);
    });
    this.then(function() {
        var status = this.currentHTTPStatus;
        if (status === 404) {
            this.warn(link + ' is missing (HTTP 404)');
        } else if (typeof status === 'number' && status >= 400) {
            // Any other client/server error (403, 500, 503, ...) is a broken
            // link too, not only 500.
            this.warn(link + f(' is broken (HTTP %s)', status));
        } else if (status === null || status === undefined) {
            // NOTE(review): presumably means no HTTP response was received
            // for the page - confirm against the CasperJS version in use.
            this.warn(link + ' returned no HTTP status');
        } else {
            this.echo(link + f(' is okay (HTTP %s)', status));
        }
    });
    this.then(function() {
        var newLinks = searchLinks.call(this);
        // check() walks `links` by index (currentLink), so the queue must be
        // append-only: removing already-checked entries would shift the
        // remaining ones under the index and make the crawler skip links.
        // Only URLs that are not queued yet are appended.
        links = links.concat(newLinks.filter(function(url, index) {
            return links.indexOf(url) === -1 && newLinks.indexOf(url) === index;
        }));
        this.echo(newLinks.length + " new links found on " + link);
    });
}
/**
 * Collects the href attribute of every <a href> element on the current page
 * and returns the ones cleanLinks() keeps, as absolute URLs.
 *
 * Relative hrefs are resolved against the document's base URI, so a
 * <base href="..."> tag in the page is honoured; the current page URL is
 * only the fallback.
 *
 * Must be called with the casper instance as `this`.
 *
 * @returns {Array.<string>} Links still to be crawled (empty when the page
 *                           could not be inspected).
 */
function searchLinks() {
    // The callback runs inside the page, so it can only use page globals.
    var page = this.evaluate(function _fetchInternalLinks() {
        return {
            base: document.baseURI || window.location.href,
            hrefs: [].map.call(__utils__.findAll('a[href]'), function(node) {
                return node.getAttribute('href');
            })
        };
    });
    // NOTE(review): evaluate() presumably returns a falsy value when the page
    // script cannot run; treat that as "no links" instead of passing it on.
    if (!page) {
        return [];
    }
    return cleanLinks(page.hrefs, page.base || this.getCurrentUrl());
}
/**
 * Crawl loop: queues the next entry of `links` and re-registers itself as
 * the run() callback, until the queue has no entry at `currentLink` or
 * `upTo` links have been processed; then prints the summary and exits.
 *
 * Must be called with the casper instance as `this`.
 */
function check() {
    var nextLink = links[currentLink];
    if (!nextLink || currentLink >= upTo) {
        this.echo("All done, " + checked.length + " links checked.");
        this.exit();
        return;
    }
    crawl.call(this, nextLink);
    currentLink += 1;
    this.run(check);
}
// Abort with a warning when no root URL was passed on the command line.
if (!url) {
casper.warn('No url passed, aborting.').exit();
}
// Bootstrap step: download the URI.js library source and expose its URI
// constructor as window.URI, which absPath() relies on.
// NOTE(review): this URL has been reported to return 404 - verify it is
// still reachable or switch to another source for the library.
casper.start('https://js-uri.googlecode.com/svn/trunk/lib/URI.js', function() {
// The downloaded source is wrapped into a function body that returns URI.
// NOTE(review): new Function() executes code fetched over the network;
// whoever controls that URL controls what runs here.
var scriptCode = this.getPageContent() + '; return URI;';
window.URI = new Function(scriptCode)();
// Sanity check: the crawl cannot resolve links without a URI constructor.
if (typeof window.URI === "function") {
this.echo('URI.js loaded');
} else {
this.warn('Could not setup URI.js').exit();
}
});
// Run the bootstrap step; process is passed as the run() callback.
casper.run(process);
/**
 * Starts a fresh casper step queue that announces the start of the crawl,
 * then runs it with check() as the completion callback.
 */
function process() {
    var started = casper.start();
    var queued = started.then(function announceStart() {
        this.echo("Starting");
    });
    queued.run(check);
}

Granze commented Jan 11, 2013

Thank you very much for the script. This is exactly what I was looking for.

A small improvement could be a single var declaration at the top, like this:

// The script's top-level declarations merged into one comma-separated
// var statement; names and initial values are the same as in the script.
var casper = require("casper").create({
      pageSettings: {
        loadImages: false,
        loadPlugins: false
      }
    }),
    checked = [],
    currentLink = 0,
    fs = require('fs'),
    upTo = ~~casper.cli.get('max-depth') || 100,
    url = casper.cli.get(0),
    baseUrl = url,
    links = [url],
    utils = require('utils'),
    f = utils.format;

thanks!

Hi,

I'm using your script to test some of my websites and I just realized that, IMO, --max-depth isn't working as expected, because of:

var upTo = ~~casper.cli.get('max-depth') || 100;

It's very confusing, because in crawler terminology max-depth is the number of link levels the crawler follows into a website, not the number of links checked. I think http://www.xml-sitemaps.com/forum/index.php?topic=1623 pretty much covers the topic of depth.

Anyway thanks for the tool :)

fedir commented Jul 2, 2013

pavelloz +1 : max-depth != number of links to check

fedir commented Jul 2, 2013

granze : is it an improvement ? or messy code ?

As a side note, this example will not work with PhantomJS 1.9.0 (in fact, everything testing the HTTP status code will be wrong):

[dalexandre:~/Desktop] $ casperjs 404checker.js http://phantomjs.org/coucou-I-do-not-exist
URI.js loaded
Starting
http://phantomjs.org/coucou-I-do-not-exist
http://phantomjs.org/coucou-I-do-not-exist is okay (HTTP 200)
1 new links found on http://phantomjs.org/coucou-I-do-not-exist
All done, 1 links checked.
[dalexandre:~/Desktop] $ phantomjs -v
1.9.0
[dalexandre:~/Desktop] $ casperjs --version
1.0.2

Hello!

Awesome script, very helpful!
I'm a CasperJS noob. I tried to add a CasperJS test assertion to it in order to get an xUnit report for this check. This is what I mean:

// Variant of crawl() in which the manual HTTP status reporting is replaced
// by a test assertion.
// NOTE(review): `test` is not defined anywhere in this snippet; it is
// presumably the CasperJS tester object and has to be provided by the caller.
function crawl(link) {
    // Step 1: print the link, open it and record it as checked.
    this.start().then(function() {
        this.echo(link, 'COMMENT');
        this.open(link);
        checked.push(link);
    });
    // Step 2: assert that the page answered with HTTP 200.
    this.then(function() {
            test.assertHttpStatus(200);
    });
    // Step 3: collect the page's links and rebuild the global queue without
    // the links that were already checked.
    this.then(function() {
        var newLinks = searchLinks.call(this);
        links = links.concat(newLinks).filter(function(url) {
            return checked.indexOf(url) === -1;
        });
        this.echo(newLinks.length + " new links found on " + link);
    });
}

I also wrapped all the code in test.begin(), but it didn't work: the script runs the tests correctly, but it doesn't generate the final report or the xUnit file.

1: thank you.

2: is there a way to limit crawl to local URL's only? E.g. when the script reaches my Google Analytics code it disappears into checking many Google links, slowing things up.

Alternatively, can I exclude links? I've tried to implement my own checks but my limited JS knowledge is stopping me.

wsams commented Mar 21, 2014

Does this script stay on the same domain? --same-domain=true would be handy if not. --max-depth also threw me off at first, but still an awesome script. Thanks

jnankin commented Aug 14, 2014

+1 singerxt . How do you get this to dump an xunit file?

jnankin commented Aug 14, 2014

For now, I modified the check() function to write the dead links to a file, and can fail a jenkins run if the file exists:

// Final report for check(): prints the summary, lists the dead links and
// writes them to deadLinks.log so a CI job can fail when that file exists.
// Relies on `this` being the casper instance and on a `dead` array of URLs
// collected elsewhere during the crawl.
this.echo("All done, " + checked.length + " links checked.");
// Drop the report of a previous run so a clean run leaves no file behind.
if (fs.exists("deadLinks.log")) fs.remove("deadLinks.log");

if (dead.length > 0) {
   this.warn("Found " + dead.length + " dead links: ");
   // Indexed loop instead of for...in: for...in also visits any enumerable
   // property added to Array.prototype and yields keys, not elements.
   for (var i = 0; i < dead.length; i++) {
      this.warn(dead[i]);
   }

   fs.write("deadLinks.log", dead.join("\n"));
}

this.exit();

jnankin commented Aug 15, 2014

I modified this script (quickly... there could be a cleaner way of doing this, but I only looked into CasperJS today) so that it can be executed as a unit test and output xUnit files.

See this gist: https://gist.github.com/jnankin/66829583bbcd9d16bb61

Sorry for my bad English. Your code does not take into account a <base> tag in the <head>.

Sorry for my rough code, I am a junior developer.

// Version of searchLinks() that takes a <base> tag into account.
/**
 * Collects the href of every <a href> element on the current page. Hrefs
 * that are plain relative references are prefixed with the href of the
 * page's <base> element, when there is one; the list is then handed to
 * cleanLinks() together with the current page URL.
 *
 * Must be called with the casper instance as `this`.
 *
 * @returns {Array.<string>} Whatever cleanLinks() returns for the hrefs.
 */
function searchLinks() {
    return cleanLinks(this.evaluate(function _fetchInternalLinks() {
        // A <base> element may carry only a target attribute; a missing href
        // must not be concatenated as the string "null".
        var baseNode = __utils__.exists('base') ? __utils__.findOne('base') : null;
        var base = (baseNode && baseNode.getAttribute('href')) || "";
        // Built once per page instead of once per anchor. It has to live
        // inside this callback because the callback runs in the page.
        var notRelative = new RegExp('^(#|ftp|javascript|http|/)');
        return [].map.call(__utils__.findAll('a[href]'), function(node) {
            var link = node.getAttribute('href');
            return notRelative.test(link) ? link : base + link;
        });
    }), this.getCurrentUrl());
}

I don't think it works right when a site requires certificates.

shuynh commented May 11, 2016

just an FYI, looks like https://js-uri.googlecode.com/svn/trunk/lib/URI.js is 404'ing now 😢

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment