Skip to content

Instantly share code, notes, and snippets.

@mickaelandrieu
Created November 26, 2013 12:50
Show Gist options
  • Save mickaelandrieu/7657798 to your computer and use it in GitHub Desktop.
Save mickaelandrieu/7657798 to your computer and use it in GitHub Desktop.
Scrape a website with CasperJS, JSON output.
// Shared crawl state ------------------------
// URLs of the quote pages, filled from the search results page.
var links = [];
// Quote objects scraped from the visited pages; written out when the run ends.
var quotes = [];
// NOTE(review): tempUrl, infos and newUrl are not referenced in the visible
// part of the script; kept in case another part of the file uses them.
var tempUrl = [];
var infos = [];
// Cap on how many result links the crawl visits.
var maxLinks = 10;
// Search results listing used as the crawl entry point.
// The title_type parameter previously read `tv_infoss`, presumably the result
// of a "serie" -> "infos" search-and-replace that also hit this URL; restored
// to `tv_series` so the search filters on TV series.
var firstUrl = 'http://www.imdb.com/search/title?at=0&num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series';
var newUrl;
// NOTE(review): the XPath selector helper and `utils` are not referenced in
// the visible part of the script.
var x = require('casper').selectXPath;
var fs = require('fs');
var utils = require('utils');
// Casper instance driving the crawl.
var casper = require('casper').create({
    verbose: true,
    logLevel: 'error',
    pageSettings: {
        // Images and plugins are not needed to read the page text.
        loadImages: false,
        loadPlugins: false,
        // Identify as a desktop Chrome browser.
        userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
    }
});
//Functions------------------------
/**
 * Builds the list of quote-page URLs from the current document.
 *
 * Reads the `href` attribute of every anchor matching
 * `.results td.image a` and wraps it as
 * `http://www.imdb.com` + href + `quotes/`.
 *
 * The function only touches `document`, so it can be handed to
 * `casper.evaluate()` and run inside the page.
 *
 * @returns {string[]} one URL per matching anchor, in document order
 */
function getLinks() {
    var anchors = document.querySelectorAll('.results td.image a');
    var urls = [];
    for (var idx = 0; idx < anchors.length; idx++) {
        var path = anchors[idx].getAttribute('href');
        urls.push('http://www.imdb.com' + path + 'quotes/');
    }
    return urls;
}
/**
 * Record for one scraped quote.
 *
 * @constructor
 * @param {string} innerText - text of the quote; stored as-is on the
 *     instance's `innerText` property, its only own property
 */
function Quote(innerText) {
    this.innerText = innerText;
}
/**
 * Echoes `what` as pretty-printed JSON (one-space indent).
 *
 * @param {*} what - value to serialize
 * @returns whatever `this.echo()` returns; the script chains `.exit()` on it
 */
casper.renderJSON = function(what) {
    var serialized = JSON.stringify(what, null, ' ');
    return this.echo(serialized);
};
/**
 * Serializes `what` as pretty-printed JSON (one-space indent) and writes it
 * to `json/quotes.json` using write mode `'w'`.
 *
 * @param {*} what - value to serialize
 */
casper.saveJSON = function(what) {
    var serialized = JSON.stringify(what, null, ' ');
    fs.write('json/quotes.json', serialized, 'w');
};
//Crawl------------------------
// Step 1: open the search results page.
casper.start(firstUrl);
// Step 2: collect the quote-page links, then queue one visit per link and
// accumulate every quote found into the shared `quotes` array.
casper.then(function collectQuotes() {
    // evaluate() yields null when the page-side function fails, so fall back
    // to an empty list instead of iterating over null.
    links = this.evaluate(getLinks) || [];
    // Visit at most `maxLinks` pages. (The previous counter check
    // `j >= maxLinks` after incrementing stopped one page early.)
    var targets = links.slice(0, maxLinks);
    this.eachThen(targets, function(response) {
        // eachThen exposes the current array item as response.data.
        this.thenOpen(response.data, function writeOnJson() {
            // Only JSON-serializable values can be returned from evaluate(),
            // so read the text inside the page rather than returning the DOM
            // elements themselves.
            var texts = this.evaluate(function() {
                return Array.prototype.map.call(
                    __utils__.findAll('.list .quote p'),
                    function(node) {
                        return node.innerText;
                    }
                );
            }) || [];
            // Append to the shared list; re-creating `quotes` here would
            // discard the quotes gathered from previously visited pages.
            texts.forEach(function(text) {
                if (text != null) {
                    quotes.push(new Quote(text));
                }
            });
        });
    });
});
// Launch the queued steps. When they have all run: write the collected quotes
// to disk, report how many there are, echo them as JSON and exit.
casper.run(function onComplete() {
    this.saveJSON(quotes);
    this.echo('quotes :' + quotes.length);
    this.renderJSON(quotes).exit();
});
@vzR
Copy link

vzR commented Sep 23, 2016

Hi, it could be that IMDb has changed its class names, but the script returns 0 quotes and an empty array.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment