Skip to content

Instantly share code, notes, and snippets.

@neilhawkins
Created August 12, 2014 06:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save neilhawkins/329e48112f78c4640130 to your computer and use it in GitHub Desktop.
Save neilhawkins/329e48112f78c4640130 to your computer and use it in GitHub Desktop.
wayback-scraper.js
// Create the CasperJS instance that drives the whole scrape.
// `verbose` prints log messages to the console as they are emitted. The
// option selecting the minimum level is spelled `logLevel` (camelCase); the
// lowercase `loglevel` key is not a recognised option, so debug output was
// never actually enabled.
var casper = require("casper").create({
    verbose: true,
    logLevel: "debug"
});
// Script-wide state:
//   response - list of snapshot URLs built from the CDX reply (filled in later)
//   url      - CDX query URL for the requested site
//   site     - target site taken from the first command-line argument
var response = [], url, site, viewportWidth = 1280, viewportHeight = 1024;
var captureSize = {top: 0, left: 0, width: viewportWidth, height: viewportHeight};
// to get screenshot folder name from url
var re = /url=(\w+)\./;
if (casper.cli.args.length < 1) {
    casper.echo("Usage: casperjs <script name> <url>").exit();
} else {
    site = casper.cli.args[0];
    // Percent-encode the target so that characters such as "?", "&" or "#"
    // inside it cannot be mistaken for parts of the CDX query string itself.
    url = "http://web.archive.org/cdx/search/cdx?url=" + encodeURIComponent(site) +
        "&filter=statuscode:200&collapse=timestamp:4&output=json&gzip=false";
}
// Helper attached to the casper instance: serialise `what` as JSON indented
// by one space per level, print it with echo() and hand back echo()'s result.
casper.renderJSON = function(what) {
    var serialised = JSON.stringify(what, null, ' ');
    return this.echo(serialised);
};
// start() expects a URL string as its first argument; a function passed in
// that position is ignored, so the viewport callback never ran. Start with no
// URL and queue the viewport change as an ordinary first step instead.
casper.start();
casper.then(function() {
    this.viewport(viewportWidth, viewportHeight);
});
casper.userAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X)');
// Step: request the capture index from the CDX URL, asking for JSON.
casper.then(function() {
    var requestSettings = {
        method: "get",
        headers: {"Accept": "application/json"}
    };
    this.open(url, requestSettings);
});
// Step: turn the reply body into a list of snapshot URLs, keep it in the
// shared `response` variable and print it.
casper.then(function() {
    var body = this.getPageContent();
    response = rowsToObjects(body);
    this.renderJSON(response);
});
// Step: for every snapshot URL, open it, give the page time to settle, hide
// the Wayback Machine toolbar and save a screenshot under screenshots/<site>/.
casper.then(function() {
    this.each(response, function(self, snapshotUrl, index) {
        this.thenOpen(snapshotUrl, function() {
            // wait() only queues a delayed step, so the toolbar must be hidden
            // in its callback; doing it right after calling wait() would run
            // before the delay has elapsed.
            this.wait(5000, function() {
                var hidden = this.evaluate(function() {
                    var toolbar = document.getElementById('wm-ipp');
                    // Not every page has the toolbar element.
                    if (!toolbar) {
                        return false;
                    }
                    toolbar.style.display = "none";
                    return true;
                });
                this.echo(hidden ?
                    'Removed Wayback Machine toolbar' :
                    'Wayback Machine toolbar not found');
            });
        });
        this.then(function() {
            this.echo('Capturing screenshot for ' + snapshotUrl);
            this.capture('screenshots/' + site + '/' + index + '.png', undefined);
            this.echo('Screenshot captured.');
        });
    });
});
// Execute every queued step, then terminate the script.
casper.run(function onComplete() {
    this.exit();
});
/**
 * Convert a JSON table (array of rows whose first row holds the column names)
 * into one object per data row, then pass those objects to createUrlList().
 *
 * @param {string} json - JSON text encoding an array of arrays.
 * @returns {*} Whatever createUrlList() returns for the row objects.
 * @throws {SyntaxError} If `json` is not valid JSON.
 */
function rowsToObjects(json) {
    // Declared with `var` so the parsed table stays local; it used to leak as
    // an implicit global, which is a ReferenceError in strict mode.
    var rows = JSON.parse(json);
    var records = [];
    // A reply with no header row (e.g. "[]") simply yields no records.
    if (!Array.isArray(rows) || rows.length === 0) {
        return createUrlList(records);
    }
    var keys = rows[0];
    // start at i = 1 to avoid adding the header row as an object
    for (var i = 1; i < rows.length; i++) {
        var record = {};
        for (var j = 0; j < keys.length; j++) {
            record[keys[j]] = rows[i][j];
        }
        records.push(record);
    }
    return createUrlList(records);
}
/**
 * Build one archive URL per record.
 *
 * @param {Array<Object>} result - Records exposing `timestamp` and `original`.
 * @returns {Array<string>} "https://web.archive.org/web/<timestamp>/<original>"
 *     for each record, in input order.
 */
function createUrlList(result) {
    var prefix = "https://web.archive.org/web/";
    return result.map(function(record) {
        return prefix + record.timestamp + "/" + record.original;
    });
}
// open api url
// make array of urls from response
// visit each url
// render screenshot of each opened url
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment