public
Created — forked from robhawkes/twitter.js

Twitter scraper in Node

  • Download Gist
twitter.js
JavaScript
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
// TODO: rename the snake_case db_* variables at some point

// Modules: `sys` is used for console output, `ntwitter` for the Twitter API, `mongoose` for MongoDB
var sys = require("sys");
var twitter = require("ntwitter");
var mongoose = require("mongoose");

// Mongoose shortcuts used when declaring the schema
var Schema = mongoose.Schema;
var ObjectId = Schema.ObjectId;

// Database connection settings (values are assigned further down)
var db_user;
var db_pass;
var db_url;
var db_port;
var db_name;
// Name of the collection the tweets are stored in
var coll;

// Schema and model for stored tweets
var TweetsSchema;
var tweetsModel;

// Search state: ID of the newest stored tweet and the current results page
var lastTweetId;
var page;

// Twitter API client
var twit;
 
// Define database settings
db_user = "CHANGME";
db_pass = "CHANGME";
db_url = "CHANGME"; // Placed in the host position of the connection URI
db_port = 27017; // Change if required
db_name = "CHANGME";
coll = "tweets";

// Set up database schema for tweets: each document keeps one tweet object under `tweet`
TweetsSchema = new Schema({
  _id: ObjectId,
  tweet: {}
});
mongoose.model("TweetsSchema", TweetsSchema);
// Fetch the registered model, passing the collection name as the second argument
// NOTE(review): relies on the legacy model(name, collection) call form; confirm against the installed Mongoose version
tweetsModel = mongoose.model("TweetsSchema", coll);

// Connect to the database.
// The settings have to be concatenated into the URI; naming the variables inside the
// string literal would send the text "db_user:db_pass@db_url:db_port" to the driver.
mongoose.connect("mongodb://" + db_user + ":" + db_pass + "@" + db_url + ":" + db_port + "/" + db_name);
 
// Set up Twitter connection
// Every CHANGME placeholder has to be replaced with the application's own Twitter API keys
var twitterCredentials = {
  consumer_key: "CHANGME",
  consumer_secret: "CHANGME",
  access_token_key: "CHANGME",
  access_token_secret: "CHANGME"
};
twit = new twitter(twitterCredentials);
 
// ID of last tweet; stays 0 when nothing has been stored yet
lastTweetId = 0;

// Start on the first page of results
page = 1;

// Find last tweet added to the database and resume the search from it
tweetsModel.find({}, {"tweet.id_str": 1}).sort("tweet.id", "descending").limit(1).run(function (err, result) {
  if (err != null) {
    sys.puts(err);
    return;
  }
  var newest = result && result.length > 0 ? result[0] : null;
  // An empty collection (e.g. the very first run) returns no documents; keep the default of 0
  // instead of dereferencing result[0], which would throw and prevent the scraper from starting.
  if (newest && newest.tweet && newest.tweet.id_str) {
    // Keep the ID as a string: tweet IDs are 64-bit integers and converting them with
    // Number() rounds values above 2^53, which would send a wrong since_id to the search.
    lastTweetId = newest.tweet.id_str;
  }
  retrieveOldTweets(lastTweetId, page);
});
 
 
// Catch up on previous tweets using the REST Search.
//
// Searches for "html5" tweets newer than lastTweetId (sent as since_id), requesting
// 100 results per page. Every result of a page is handed to addTweet(), then the next
// page is requested. An empty page means the backlog is done and startStream() takes over.
// A search error is logged and ends the catch-up without starting the stream.
//
// lastTweetId - number or string; only tweets with a higher ID are requested
// page        - results page to request (the caller starts at 1)
function retrieveOldTweets(lastTweetId, page) {
  var query = {"rpp": 100, "result_type": "recent", "since_id": lastTweetId, "page": page};
  twit.search("html5", query, function (err, data) {
    if (err != null) {
      sys.puts(err);
      return;
    }
    // A response without a results array is handled like an empty page rather than throwing
    var results = (data && data.results) || [];
    if (results.length === 0) {
      sys.puts("No more search results");
      // Once caught up use the Streaming API for live tracking
      startStream();
      return;
    }
    var i;
    for (i = 0; i < results.length; i++) {
      // Pass the element directly; assigning it to an undeclared `tweet` would create an implicit global
      addTweet(results[i]);
    }
    // Request the following page without mutating the parameter
    retrieveOldTweets(lastTweetId, page + 1);
  });
}
 
// Live tracking: open a "statuses/filter" stream for "html5" and store each incoming tweet
function startStream() {
  // This should probably grab a few previous tweets to make sure the transition is seamless
  var filter = {track: "html5"};

  // Stream errors and the end-of-stream payload are only written to the log
  function logInspected(payload) {
    sys.puts(sys.inspect(payload));
  }

  // Wire up the three stream events once the connection object is available
  function attachHandlers(stream) {
    stream.on("data", function (incoming) {
      addTweet(incoming);
    });
    stream.on("error", logInspected);
    stream.on("end", logInspected);
  }

  twit.stream("statuses/filter", filter, attachHandlers);
}
 
// Store a tweet unless a document with the same tweet id is already in the database.
// Objects without an id are reported and skipped; every outcome is written to the log.
function addTweet(tweet) {
  if (!tweet.id) {
    sys.puts("Tweet id is not present");
    return;
  }

  var duplicateQuery = {"tweet.id": tweet.id};

  // Reports the result of the save
  function onSaved(saveErr) {
    if (saveErr != null) {
      sys.puts(saveErr);
      return;
    }

    sys.puts("Added tweet with id " + tweet.id);
  }

  // Receives the number of stored tweets with the same id
  function onCounted(countErr, existing) {
    if (countErr != null) {
      sys.puts(countErr);
      return;
    }
    if (existing > 0) {
      sys.puts("Tweet already exists with id " + tweet.id);
      return;
    }
    // Only a count equal to 0 leads to an insert; any other value is ignored
    if (existing != 0) {
      return;
    }
    var record = new tweetsModel();
    record.tweet = tweet;
    record.save(onSaved);
  }

  // Count existing tweets with the same id
  tweetsModel.count(duplicateQuery, onCounted);
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.