Skip to content

Instantly share code, notes, and snippets.

@maxpaynestory
Created September 5, 2014 17:59
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save maxpaynestory/02563716bef271316c3d to your computer and use it in GitHub Desktop.
Save maxpaynestory/02563716bef271316c3d to your computer and use it in GitHub Desktop.
Scrape all tweets from a Twitter account using CasperJS and PhantomJS
// Whitespace helpers installed on String.prototype (kept ES5-compatible for PhantomJS).

// Strip whitespace from both ends of the string.
String.prototype.trim = function () {
  var bothEnds = /^\s+|\s+$/g;
  return this.replace(bothEnds, '');
};

// Strip leading whitespace only.
String.prototype.ltrim = function () {
  var leadingRun = /^\s+/;
  return this.replace(leadingRun, '');
};

// Strip trailing whitespace only.
String.prototype.rtrim = function () {
  var trailingRun = /\s+$/;
  return this.replace(trailingRun, '');
};

// Delete every whitespace run that touches the start of the string, the end of
// the string, or a newline; then squeeze each remaining run to a single space.
String.prototype.fulltrim = function () {
  var edgeRuns = /(?:(?:^|\n)\s+|\s+(?:$|\n))/g;
  var anyRun = /\s+/g;
  var withoutEdges = this.replace(edgeRuns, '');
  return withoutEdges.replace(anyRun, ' ');
};
/**
 * Format the UTC calendar day of this date as "MM/DD/YYYY".
 * Month and day are left-padded with "0" to two characters; the year is
 * emitted exactly as getUTCFullYear() reports it.
 *
 * @returns {string} The formatted date.
 */
Date.prototype.MMDDYYYY = function () {
  // Prefix a single-character number string with "0"; leave longer ones alone.
  function twoDigits(value) {
    var text = value.toString();
    return text.length > 1 ? text : "0" + text.charAt(0);
  }

  var parts = [
    twoDigits(this.getUTCMonth() + 1), // getUTCMonth() is zero-based
    twoDigits(this.getUTCDate()),
    this.getUTCFullYear().toString()
  ];
  return parts.join("/");
};
/**
 * Turn an arbitrary value into a double-quoted CSV field.
 *
 * null/undefined become an empty field and any other value is converted with
 * String(). Double quotes are removed (not escaped), every run of whitespace
 * (including newlines) collapses to one space, and leading/trailing whitespace
 * is dropped before the result is wrapped in double quotes.
 *
 * Whitespace is normalised with standard String methods only, so the function
 * does not depend on custom String.prototype extensions being installed.
 *
 * @param {*} str - Value to sanitise; usually the text of a scraped element.
 * @returns {string} The sanitised text wrapped in double quotes.
 */
function SanitizeString(str) {
  // `== null` deliberately matches both null and undefined.
  var text = str == null ? '' : String(str);
  var cleaned = text
    .replace(/"/g, '')     // embedded quotes would break the quoted CSV field
    .replace(/\s+/g, ' ')  // newlines, tabs and repeated blanks -> one space
    .trim();
  return '"' + cleaned + '"';
}
var utils = require('utils');
var fs = require('fs');
// Single Casper instance that drives the whole scrape.
var casper = require('casper').create({
  // Settings applied to the underlying page object.
  pageSettings: {
    // Present a desktop Chrome user agent string to the site.
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
    // Only element text is read, so plugins and images are switched off.
    loadPlugins: false,
    loadImages: false
  },
  // Logging: verbose output enabled, with the level set to 'error'.
  logLevel: 'error',
  verbose: true
});
// Twitter account to scrape; supplied through the mandatory --account_name option.
var tweet_account_name;
if (!casper.cli.has("account_name")) {
  casper.echo("account name is missing");
  casper.echo("Use like this: casperjs.exe tweet_scrap.js --account_name=Karachi_Update");
  casper.exit(1);
  // NOTE(review): exit() may not halt the script synchronously on every
  // PhantomJS version, in which case the statements below still run - confirm.
}
tweet_account_name = casper.cli.get("account_name");

// Explicitly declared so the name is not created as an implicit global.
var outputfilename = "scraped_tweets.csv";
// CSV header row; the second column holds each tweet's timestamp text.
var header = "Tweet,Timestamp";

// Start every run from a clean file.
if (fs.exists(outputfilename)) {
  fs.remove(outputfilename);
}
var stream = fs.open(outputfilename, "w");
stream.writeLine(header);
/**
 * Queue the scraping of one timeline page and, recursively, of each following page.
 *
 * Schedules a one-second wait followed by a navigation to `newurl`. When that
 * page is open, every tweet found on it is written to `stream` as a
 * `tweet,timestamp` CSV row (both fields passed through SanitizeString). If the
 * page has a "more" link pointing somewhere other than the current page, the
 * function queues itself again for that link.
 *
 * @param {Object} thecasper - Casper instance on which the steps are queued.
 * @param {string} newurl - URL of the timeline page to open.
 * @param {Object} stream - Open output stream exposing writeLine().
 * @returns {undefined}
 */
function RecursiveTriverse(thecasper, newurl, stream) {
  var TWEET_SELECTOR = 'div.tweet-text div.dir-ltr';
  var TIMESTAMP_SELECTOR = 'td.timestamp a';

  thecasper.wait(1000);
  thecasper.thenOpen(newurl, function () {
    var page = this;
    var currentUrl = page.getCurrentUrl();

    // getElementsInfo is expected to raise when nothing matches, so check
    // first and treat "no match" as an empty list instead of aborting the run.
    var tweets = page.exists(TWEET_SELECTOR) ? page.getElementsInfo(TWEET_SELECTOR) : [];
    var timestamps = page.exists(TIMESTAMP_SELECTOR) ? page.getElementsInfo(TIMESTAMP_SELECTOR) : [];

    page.echo("Scraping tweets from " + currentUrl);

    // Index loop rather than for...in: only real array elements are visited.
    for (var i = 0; i < tweets.length; i++) {
      // A tweet without a matching timestamp still gets a row, with an empty field.
      var stampInfo = timestamps[i];
      var tweet = SanitizeString(tweets[i].text);
      var time_stamp = SanitizeString(stampInfo ? stampInfo.text : null);
      stream.writeLine(tweet + "," + time_stamp);
    }

    // Read the link's resolved `href` property inside the page, so a relative
    // href attribute still yields an absolute URL that can be opened.
    var nextUrl = page.evaluate(function () {
      var link = document.querySelector('div.w-button-more a');
      return link ? link.href : null;
    });

    // Stop when there is no "more" link, or when it points back at the page
    // that was just scraped (which would otherwise loop forever).
    if (nextUrl && nextUrl !== currentUrl) {
      RecursiveTriverse(page, nextUrl, stream);
    }
  });
}
// Entry point: open the account's mobile timeline, then hand the loaded page
// to RecursiveTriverse, which queues the scraping of it and of later pages.
casper.start('https://mobile.twitter.com/' + tweet_account_name, function () {
  RecursiveTriverse(this, this.getCurrentUrl(), stream);
});

// Final step: release the output file.
casper.then(function () {
  // Flush buffered rows before closing; flushing an already-closed stream
  // is at best a no-op.
  stream.flush();
  stream.close();
});

casper.run();
Copy link

ghost commented Jan 27, 2015

Hello, I altered your scrape_tweets.js and it's at https://gist.github.com/nwaomachux/35d1c424966fccd16ae1. The one you provided entered an infinite loop.

@CAMIZOCA
Copy link

Something is not right; I get this message:

PhantomJS has crashed. Please read the crash reporting guide at https://github.com/ariya/phantomjs/wiki/Crash-Reporting and file a bug report at https://github.com/ariya/phantomjs/issues/new with the crash dump file attached: /tmp/500da006-756f-95e2-76aadad4-5f7b9274.dmp
Segmentation fault

Copy link

ghost commented Aug 28, 2015

@CAMIZOCA, I have been having that issue too. It has to do with many intricacies that I am not able to explain in full detail, but I made a workaround.

I switched to v2.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment