Skip to content

Instantly share code, notes, and snippets.

@meefen
Last active December 30, 2015 21:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save meefen/7885000 to your computer and use it in GitHub Desktop.
Save meefen/7885000 to your computer and use it in GitHub Desktop.
This gist contains Java code for archiving tweets in MongoDB, and R code for retrieving tweets for analysis. Please note that there are a few dependencies: (1) You should set up your MongoDB (I used the free plan at https://mongolab.com/); (2) You should have set up your Twitter API access (check https://dev.twitter.com/); (3) Java code: include twi…
library(rmongodb)

## Host info and credentials
host <- "ds053858.mongolab.com:53858"
username <- "your_username"
password <- "your_pass"
db <- "your_db"

## Connect to mongodb
mongo <- mongo.create(host=host, db=db,
                      username=username, password=password)
# Check the connection explicitly so a failed connection is reported here
# instead of surfacing later as an obscure query error.
if (!mongo.is.connected(mongo)) {
  stop("Could not connect to MongoDB at ", host)
}

## Get a list of collections within our namespace
# here I used each collection for a twitter archive
mongo.get.database.collections(mongo, db)

## Create a string that points to the namespace
# the collection I'm interested in is "#mri13"
# (a namespace has the form "<database>.<collection>")
collection <- "#mri13"
namespace <- paste(db, collection, sep=".")

## Check the total number of tweets in "#mri13"
mongo.count(mongo, namespace, mongo.bson.empty())

## Build a query to find how many tweets were posted by me
buf <- mongo.bson.buffer.create()
mongo.bson.buffer.append(buf, "user_name", "bodongchen")
query <- mongo.bson.from.buffer(buf)
# get the count
count <- mongo.count(mongo, namespace, query)
count

## Get all tweets posted by me
tweets <- list()
cursor <- mongo.find(mongo, namespace, query)
while (mongo.cursor.next(cursor)) {
  # pull only the tweet text out of the current record
  val <- mongo.cursor.value(cursor)
  tweets[[length(tweets)+1]] <- mongo.bson.value(val, "tweet_text")
}
# release the cursor once we are done iterating
mongo.cursor.destroy(cursor)
length(tweets)
## Retrieve all tweets and put into a dataframe
library(plyr)
# Collect one single-row data frame per record and bind them all at once.
# Calling rbind.fill() inside the loop copies the growing data frame on every
# iteration, which becomes very slow on large collections.
rows <- list()
cursor <- mongo.find(mongo, namespace)
while (mongo.cursor.next(cursor)) {
  # iterate and grab the next record
  tmp <- mongo.bson.to.list(mongo.cursor.value(cursor))
  # flatten the record into a one-row dataframe
  rows[[length(rows) + 1]] <- as.data.frame(t(unlist(tmp)), stringsAsFactors = FALSE)
}
# release the cursor once we are done iterating
mongo.cursor.destroy(cursor)
# rbind.fill() accepts a list of data frames and fills missing columns with NA;
# fall back to an empty data frame when the collection has no records
df_arch1 <- if (length(rows) > 0) rbind.fill(rows) else data.frame(stringsAsFactors = FALSE)
dim(df_arch1)

### Try with another collection
collection2 <- "#edtechchat"
namespace2 <- paste(db, collection2, sep=".")
mongo.count(mongo, namespace2, mongo.bson.empty())
package twitter_loop;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.Mongo;
import com.mongodb.MongoException;
import java.net.UnknownHostException;
import java.util.List;
import java.util.Scanner;
import twitter4j.Query;
import twitter4j.QueryResult;
import twitter4j.Status;
import twitter4j.Twitter;
import twitter4j.TwitterException;
import twitter4j.TwitterFactory;
import twitter4j.UserMentionEntity;
import twitter4j.conf.ConfigurationBuilder;
/**
 * Repeatedly searches Twitter for a keyword and archives the matching tweets
 * in a MongoDB collection named after that keyword.
 *
 * <p>Fill in the MongoDB and Twitter settings below before running the program.
 */
public class Twitter_loop {

    // MongoDB connection settings
    private final String host = "ds053858.mongolab.com";
    private final int port = 53858;
    private final String dbName = "your_db";
    private final String username = "your_username";
    private final char[] password = "your_password".toCharArray();

    // Twitter API credentials
    private final String consumerKey = "yourkey";
    private final String consumerSecret = "yoursecret";
    private final String accessToken = "yourtoken";
    private final String accessTokenSecret = "yourtokensecret";

    // Pause between two searches, in seconds
    private final int seconds = 60;
    // Number of tweets requested per search
    private final int count = 100;

    // Created lazily by twitterClient() and reused for every search
    private Twitter twitter;
    private DB db;
    // Target collection; stays null until connectdb() has succeeded
    private DBCollection items;

    /**
     * Program entry point: creates a {@code Twitter_loop} and starts the
     * interactive archive loop.
     *
     * @param args unused
     * @throws InterruptedException if the wait between two searches is interrupted
     */
    public static void main(String[] args) throws InterruptedException {
        new Twitter_loop().loadMenu();
    }

    /**
     * Reads a keyword from standard input, connects to MongoDB and then
     * searches for and archives matching tweets every {@code seconds} seconds
     * until the thread is interrupted. Returns early when no keyword is given
     * or the database connection could not be established.
     *
     * @throws InterruptedException if the wait between two searches is interrupted
     */
    public void loadMenu() throws InterruptedException {
        System.out.print("Please choose your Keyword:\t");
        // Deliberately not closed: closing the Scanner would also close System.in.
        Scanner input = new Scanner(System.in);
        if (!input.hasNextLine()) {
            System.out.println("No keyword given, nothing to archive.");
            return;
        }
        String keyword = input.nextLine().trim();
        if (keyword.isEmpty()) {
            System.out.println("No keyword given, nothing to archive.");
            return;
        }

        connectdb(keyword);
        if (items == null) {
            // connectdb() has already reported the cause.
            return;
        }

        while (true) {
            getTweetByQuery(true, keyword);
            // long arithmetic so a large interval cannot overflow int
            Thread.sleep(seconds * 1000L);
        }
    }

    /**
     * Connects to MongoDB and selects the collection named {@code keyword},
     * making sure it has a unique index on {@code tweet_ID}. On failure the
     * error is printed and no collection is selected.
     *
     * @param keyword name of the collection to archive into
     */
    public void connectdb(String keyword) {
        items = null;
        try {
            initMongoDB();
            DBCollection collection = db.getCollection(keyword);
            // make the tweet_ID unique in the database
            BasicDBObject index = new BasicDBObject("tweet_ID", 1);
            collection.ensureIndex(index, new BasicDBObject("unique", true));
            // Publish the collection only once the unique index is in place.
            items = collection;
        } catch (MongoException ex) {
            System.out.println("MongoException :" + ex.getMessage());
        }
    }

    /**
     * Opens the connection to the configured MongoDB server and authenticates
     * against the configured database.
     *
     * @throws MongoException if the host cannot be resolved or the
     *         credentials are rejected
     */
    public void initMongoDB() throws MongoException {
        System.out.println("Connecting to Mongo DB..");
        try {
            Mongo mongo = new Mongo(host, port);
            DB database = mongo.getDB(dbName);
            if (!database.authenticate(username, password)) {
                mongo.close();
                throw new MongoException("Authentication failed for database " + dbName);
            }
            db = database;
        } catch (UnknownHostException ex) {
            // Propagate instead of leaving db null for the caller to trip over.
            throw new MongoException("Unknown MongoDB host " + host, ex);
        }
    }

    /**
     * Runs one Twitter search for {@code keyword} and inserts every returned
     * tweet into the selected collection. Twitter errors are printed, not thrown.
     *
     * @param loadRecords when {@code true}, prints the archived records afterwards
     * @param keyword the search query
     * @throws InterruptedException never thrown by this implementation; kept
     *         so existing callers keep compiling
     */
    public void getTweetByQuery(boolean loadRecords, String keyword) throws InterruptedException {
        if (items == null) {
            System.out.println("MongoDB is not Connected! Please check mongoDB intance running..");
            return;
        }
        try {
            Query query = new Query(keyword);
            query.setCount(count);
            QueryResult result = twitterClient().search(query);
            System.out.println("Getting Tweets...");
            List<Status> tweets = result.getTweets();
            for (Status tweet : tweets) {
                try {
                    items.insert(toDocument(tweet));
                } catch (MongoException e) {
                    // Best effort: one failed insert must not stop the batch.
                    System.out.println("MongoDB insert error : " + e.getMessage());
                }
            }
            // Printing fetched records from DB.
            if (loadRecords) {
                getTweetsRecords();
            }
        } catch (TwitterException te) {
            System.out.println("te.getErrorCode() " + te.getErrorCode());
            System.out.println("te.getExceptionCode() " + te.getExceptionCode());
            System.out.println("te.getStatusCode() " + te.getStatusCode());
            if (te.getStatusCode() == 401) {
                System.out.println("Twitter Error : \nAuthentication credentials (https://dev.twitter.com/pages/auth) were missing or incorrect.\nEnsure that you have set valid consumer key/secret, access token/secret, and the system clock is in sync.");
            } else {
                System.out.println("Twitter Error : " + te.getMessage());
            }
        }
    }

    /**
     * Prints the {@code _id}, {@code user_name} and {@code tweet_text} of every
     * record in the selected collection.
     *
     * @throws InterruptedException never thrown by this implementation; kept
     *         so existing callers keep compiling
     */
    public void getTweetsRecords() throws InterruptedException {
        if (items == null) {
            System.out.println("MongoDB is not Connected! Please check mongoDB intance running..");
            return;
        }
        BasicDBObject fields = new BasicDBObject("_id", true).append("user_name", true).append("tweet_text", true);
        DBCursor cursor = items.find(new BasicDBObject(), fields);
        try {
            while (cursor.hasNext()) {
                System.out.println(cursor.next());
            }
        } finally {
            cursor.close();
        }
    }

    /** Returns the Twitter client, creating it from the configured credentials on first use. */
    private Twitter twitterClient() {
        if (twitter == null) {
            ConfigurationBuilder cb = new ConfigurationBuilder();
            cb.setDebugEnabled(true);
            cb.setOAuthConsumerKey(consumerKey);
            cb.setOAuthConsumerSecret(consumerSecret);
            cb.setOAuthAccessToken(accessToken);
            cb.setOAuthAccessTokenSecret(accessTokenSecret);
            twitter = new TwitterFactory(cb.build()).getInstance();
        }
        return twitter;
    }

    /** Copies the archived fields of one tweet into a MongoDB document. */
    private BasicDBObject toDocument(Status tweet) {
        BasicDBObject doc = new BasicDBObject();
        doc.put("user_name", tweet.getUser().getScreenName());
        doc.put("retweet_count", tweet.getRetweetCount());
        doc.put("tweet_followers_count", tweet.getUser().getFollowersCount());
        doc.put("source", tweet.getSource());
        UserMentionEntity[] mentioned = tweet.getUserMentionEntities();
        doc.put("tweet_mentioned_count", mentioned.length);
        doc.put("tweet_ID", tweet.getId());
        doc.put("tweet_text", tweet.getText());
        return doc;
    }
}
@SONEINT
Copy link

SONEINT commented Mar 22, 2014

Dear dirkchen, I'm currently working with the same code to transfer my collections of tweets from my MongoDB databases to R for text mining & social network analysis.
I did not understand how your collection name is converted with the paste() function, which gave me a NULL result.
Anyway, I have run into a limitation with the rmongodb and plyr packages on large tweet collections, and things don't work properly for me. I have posted a Stack Overflow question here: http://stackoverflow.com/questions/22445419/transfer-large-mongodb-collections-to-data-frame-in-r-with-rmongodb-and-plyr
Any help would be greatly appreciated.

Cyrille

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment