Skip to content

Instantly share code, notes, and snippets.

@maxpaynestory
Created August 27, 2014 16:14
Show Gist options
  • Save maxpaynestory/ff2676630978bf120634 to your computer and use it in GitHub Desktop.
Save maxpaynestory/ff2676630978bf120634 to your computer and use it in GitHub Desktop.
Scrape IMDb movie listings to a CSV file, with poster images, using CasperJS and PhantomJS
String.prototype.trim=function(){return this.replace(/^\s+|\s+$/g, '');}
String.prototype.ltrim=function(){return this.replace(/^\s+/,'');};
String.prototype.rtrim=function(){return this.replace(/\s+$/,'');};
String.prototype.fulltrim=function(){return this.replace(/(?:(?:^|\n)\s+|\s+(?:$|\n))/g,'').replace(/\s+/g,' ');};
convertToSlug = function(Text)
{
return Text
.toLowerCase()
.replace(/[^\w ]+/g,'')
.replace(/ +/g,'-')
;
}
SanitizeString = function(str){
if(str==null){
str = '';
}
str = str.trim();
var returnstring = str.replace(/"/g,'');
returnstring = returnstring.replace(/\n/g," ");
returnstring = '"' + returnstring.fulltrim() + '"';
return returnstring;
}
indexOf = function(needle) {
if(typeof Array.prototype.indexOf === 'function') {
indexOf = Array.prototype.indexOf;
} else {
indexOf = function(needle) {
var i = -1, index = -1;
for(i = 0; i < this.length; i++) {
if(this[i] === needle) {
index = i;
break;
}
}
return index;
};
}
return indexOf.call(this, needle);
};
ScrapeImdbMovieInfo = function(c,stream,imagefolder,scraped_imdb_movies){
if(c.exists("td#overview-top h1.header span.itemprop")){
var name = c.getElementInfo("td#overview-top h1.header span.itemprop").text;
if(indexOf.call(scraped_imdb_movies, name.toLowerCase())>-1){
return;
}
scraped_imdb_movies.push(name.toLowerCase());
c.echo("Getting info for " + name);
var year = c.getElementInfo("td#overview-top h1.header span.nobr a").text;
var slug = convertToSlug(name + " " + year);
var inside = c.evaluate(function(){
var inside = {};
inside.cast = $("div#titleCast table.cast_list tr td.itemprop a span.itemprop").map(function () {return $(this).text();}).get().join(',');
inside.genres = $("span[itemprop='genre']").map(function () {return $(this).text();}).get().join(',');
return inside;
});
var cast = inside.cast;
var genres = inside.genres;
var description = c.getElementInfo("td#overview-top p[itemprop='description']").text;
var source = c.getCurrentUrl();
var rating = '0';
if(c.exists("td#overview-top div.titlePageSprite.star-box-giga-star")){
rating = c.getElementInfo("td#overview-top div.titlePageSprite.star-box-giga-star").text;
}
var imagename = "";
if(c.exists("td#img_primary div a img")){
imagename = slug + ".jpg";
c.download(c.getElementInfo("td#img_primary div a img").attributes.src,imagefolder + "/" + imagename);
}
name = SanitizeString(name);
year = SanitizeString(year);
cast = SanitizeString(cast);
description = SanitizeString(description);
rating = SanitizeString(rating);
genres = SanitizeString(genres);
stream.writeLine(name + "," + year + "," + slug + "," + cast + "," + description + "," + source + "," + rating + "," + imagename + "," + genres);
}
}
var casper = require('casper').create({
verbose: true,
logLevel: 'error',
pageSettings: {
loadImages: false,
loadPlugins: false,
webSecurityEnabled:false,
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
}
});
var utils = require('utils');
var fs = require('fs');
var screenshot = "tasveer.png";
var outputfilename = "myimdb_data.csv";
var imagefolder = "imdb_images";
var scraped_imdb_movies = [];
var header = "Name,Year,Slug,Cast,Description,Source,Rating,Imagename.Genres";
var movie_links = [];
if(fs.exists(screenshot)){
fs.remove(screenshot);
}
if(fs.exists(outputfilename)){
fs.remove(outputfilename);
}
if(!fs.exists(imagefolder)){
fs.makeDirectory(imagefolder);
}else{
var filenames = fs.list(imagefolder);
for(var fileindex=0;fileindex<filenames.length;fileindex++){
if(fs.isFile(imagefolder + '/' + filenames[fileindex])){
fs.remove(imagefolder + '/' + filenames[fileindex]);
}
}
}
var stream = fs.open(outputfilename,"w");
casper.start('http://www.imdb.com/movies-in-theaters/',function(){
if(!this.exists("div#main h1.header")){
this.echo("Unable to open imdb Movies in theaters");
this.exit(1);
}
var list_items = this.getElementsInfo("div.list_item[itemtype='http://schema.org/Movie'] table tbody tr td:nth-child(2) h4 a");
this.echo("Picked " + list_items.length + " items");
for(var itemindex=0;itemindex<list_items.length;itemindex++)
{
movie_links.push("http://www.imdb.com" + list_items[itemindex].attributes.href);
}
});
casper.thenOpen("http://www.imdb.com/search/title?at=0&genres=action&sort=user_rating&title_type=feature",function(){
this.repeat(40,function(){
var list_items = this.getElementsInfo("div#main table.results tbody tr td.image a");
this.echo("Picked " + list_items.length + " items");
for(var itemindex=0;itemindex<list_items.length;itemindex++)
{
movie_links.push("http://www.imdb.com" + list_items[itemindex].attributes.href);
}
this.thenClick("div#main div.leftright:nth-child(1) div#right span.pagination a:last-child");
});
});
casper.then(function(){
stream.writeLine(header);
for(var m=0;m<movie_links.length;m++){
this.thenOpen(movie_links[m],function(){
ScrapeImdbMovieInfo(this,stream,imagefolder,scraped_imdb_movies)
});
}
});
casper.then(function(){
stream.close();
stream.flush();
});
casper.run();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment