Created
March 31, 2010 14:52
-
-
Save mojavelinux/350421 to your computer and use it in GitHub Desktop.
A script to incrementally archive a user's Twitter timeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env groovy | |
import groovyx.net.http.RESTClient | |
import groovy.xml.MarkupBuilder | |
import org.apache.commons.lang.StringEscapeUtils | |
// Command-line options: the account to archive (-u/--user) and its password (-p/--pwd).
def cli = new CliBuilder(usage: "archiveusertimeline [OPTION]")
cli.u(longOpt: "user", args: 1, required: true, "User to archive")
cli.p(longOpt: "pwd", args: 1, required: true, "Password of user")
def opts = cli.parse(args)
// Continue only when parsing produced a result and both values are non-empty.
def haveCredentials = opts != null && opts.u && opts.p
if (!haveCredentials) {
    println "Must provide a user to archive and that user's password"
    return
}
// Short names used by the rest of the script.
def (u, p) = [opts.u, opts.p]
// Per-user archive file: <user.home>/docs/timelines/<user>.xml
def f = new File(System.getProperty("user.home") + "/docs/timelines/" + u + ".xml")
// Id of the newest archived status; stays null when nothing has been archived yet.
def lastId
// Parsed content of the existing archive; stays null when there is no usable file.
def archive
// Number of statuses already archived.
def numStatuses = 0
if (f.exists() && f.length() > 0) {
    archive = new XmlSlurper().parseText(f.text)
    // An archive can legitimately contain no <status> elements (a previous run that
    // found nothing still writes the empty <statuses> root). Reading the first status
    // of such a file yields empty strings, and "".toInteger() would abort the script.
    if (archive.status.size() > 0) {
        def newest = archive.status[0]
        lastId = newest.id.text()
        def recorded = newest.status_number.text()
        // The newest status carries the running total; fall back to counting the
        // archived statuses if that field is missing or not a number.
        numStatuses = recorded.isInteger() ? recorded.toInteger() : archive.status.size()
    }
}
// Pattern of the created_at timestamps; used when statuses are ordered by date.
def DATE_FORMAT = "EEE MMM dd HH:mm:ss Z yyyy"
// Base URL for API requests; also used as the prefix of the generated status links.
def TWITTER_API_URL = "http://api.twitter.com"
// REST client bound to the API, authenticating with the supplied user and password.
def rest = new RESTClient(TWITTER_API_URL)
rest.auth.basic(u, p)
// For XML responses hand back the raw content stream so it can be parsed with XmlSlurper.
rest.parser.'application/xml' = { response -> response.entity.content }
// Fetch the user's timeline page by page (200 per request) until an empty page comes back.
def page = 1
// Only ask for statuses newer than the newest archived one, when there is one.
def params = lastId ? [count: 200, since_id: lastId] : [count: 200]
def timeline = []
def numTotal = 0
def resp = null
while (true) {
    params.page = page
    if (page > 1) {
        // pause between requests so we don't look like a DoS attack
        sleep(5000)
    }
    resp = rest.get(path: '/statuses/user_timeline/' + u + '.xml', query: params)
    def batch = new XmlSlurper().parse(resp.data).status
    def batchSize = batch.size()
    if (batchSize == 0) {
        // an empty page means everything has been fetched
        break
    }
    numTotal += batchSize
    // append this page's statuses in the order they were returned
    batch.each { timeline << it }
    page++
}
// Fetch the user's retweets with the same paging parameters, appending them to the timeline.
page = 1
// Set once at least one retweet was fetched; the timeline then needs re-sorting by date.
def newRetweets = false
while (true) {
    params.page = page
    // pause before every request so we don't look like a DoS attack
    sleep(5000)
    resp = rest.get(path: '/statuses/retweeted_by_me.xml', query: params)
    def batch = new XmlSlurper().parse(resp.data).status
    def batchSize = batch.size()
    if (batchSize == 0) {
        // an empty page means everything has been fetched
        break
    }
    newRetweets = true
    numTotal += batchSize
    // append this page's retweets in the order they were returned
    batch.each { timeline << it }
    page++
}
if (newRetweets) {
    // Retweets were appended after the regular statuses, so re-sort the merged list by
    // creation date, newest first.
    // The formatter is pinned to Locale.ENGLISH: the pattern contains day and month names,
    // which would otherwise be parsed using the JVM's default locale.
    // NOTE(review): assumes created_at always uses English abbreviations - confirm.
    def createdAtFormat = new java.text.SimpleDateFormat(DATE_FORMAT, Locale.ENGLISH)
    def createdAt = { status -> createdAtFormat.parse(status.created_at.text()) }
    // Use a real three-way comparison (b against a for descending order). A comparator
    // that never returns 0 breaks the Comparator contract and can make the sort fail.
    timeline.sort { a, b -> createdAt(b) <=> createdAt(a) }
}
// Rewrite the archive file: newly fetched statuses first, then the previously archived
// ones. Statuses are numbered downward from the new total, so the first status in the
// file always carries the total count.
numStatuses += timeline.size()
def statusNumber = numStatuses
// Create the archive directory on first use so opening the file does not fail after
// all the fetching has already been done.
f.parentFile?.mkdirs()
// NOTE(review): FileWriter uses the platform default charset while the declaration below
// says UTF-8. Changing it must go together with the code that reads the archive back.
def writer = new FileWriter(f)
try {
    def xml = new MarkupBuilder(writer)
    xml.setDoubleQuotes(true)
    xml.mkp.xmlDeclaration(version:'1.0', encoding:'UTF-8')
    xml.statuses(type:"array") {
        // Newly fetched statuses (text is XML-unescaped before being written).
        timeline.each { e ->
            status() {
                created_at(e.created_at.text())
                id(e.id.text())
                status_number(statusNumber--)
                text(StringEscapeUtils.unescapeXml(e.text.text()))
                link(TWITTER_API_URL + "/" + u + "/status/" + e.id.text())
                source(e.source.text())
                truncated(e.truncated.text())
                in_reply_to_status_id(e.in_reply_to_status_id.text())
                in_reply_to_user_id(e.in_reply_to_user_id.text())
                in_reply_to_screen_name(e.in_reply_to_screen_name.text())
                if (e.retweeted_status.size() != 0) {
                    retweeted_status() {
                        created_at(e.retweeted_status.created_at.text())
                        id(e.retweeted_status.id.text())
                        text(StringEscapeUtils.unescapeXml(e.retweeted_status.text.text()))
                        link(TWITTER_API_URL + "/" + e.retweeted_status.user.screen_name.text() + "/status/" + e.retweeted_status.id.text())
                        source(e.retweeted_status.source.text())
                        truncated(e.retweeted_status.truncated.text())
                        in_reply_to_status_id(e.retweeted_status.in_reply_to_status_id.text())
                        in_reply_to_user_id(e.retweeted_status.in_reply_to_user_id.text())
                        in_reply_to_screen_name(e.retweeted_status.in_reply_to_screen_name.text())
                        user() {
                            id(e.retweeted_status.user.id.text())
                            name(e.retweeted_status.user.name.text())
                            screen_name(e.retweeted_status.user.screen_name.text())
                        }
                    }
                }
                user() {
                    id(e.user.id.text())
                    name(e.user.name.text())
                    screen_name(e.user.screen_name.text())
                }
                geo() {
                    def coordinates = e.geo.text().trim()
                    if (coordinates) {
                        point(coordinates)
                    }
                }
            }
        }
        // Previously archived statuses, copied over as stored and renumbered.
        if (archive) {
            archive.status.each { e ->
                status() {
                    created_at(e.created_at.text())
                    id(e.id.text())
                    status_number(statusNumber--)
                    text(e.text.text())
                    link(TWITTER_API_URL + "/" + u + "/status/" + e.id.text())
                    source(e.source.text())
                    truncated(e.truncated.text())
                    in_reply_to_status_id(e.in_reply_to_status_id.text())
                    in_reply_to_user_id(e.in_reply_to_user_id.text())
                    in_reply_to_screen_name(e.in_reply_to_screen_name.text())
                    if (e.retweeted_status.size() != 0) {
                        retweeted_status() {
                            created_at(e.retweeted_status.created_at.text())
                            id(e.retweeted_status.id.text())
                            text(e.retweeted_status.text.text())
                            link(e.retweeted_status.link.text())
                            source(e.retweeted_status.source.text())
                            truncated(e.retweeted_status.truncated.text())
                            in_reply_to_status_id(e.retweeted_status.in_reply_to_status_id.text())
                            in_reply_to_user_id(e.retweeted_status.in_reply_to_user_id.text())
                            in_reply_to_screen_name(e.retweeted_status.in_reply_to_screen_name.text())
                            user() {
                                id(e.retweeted_status.user.id.text())
                                name(e.retweeted_status.user.name.text())
                                screen_name(e.retweeted_status.user.screen_name.text())
                            }
                        }
                    }
                    user() {
                        id(e.user.id.text())
                        name(e.user.name.text())
                        screen_name(e.user.screen_name.text())
                    }
                    geo() {
                        // Carry the stored coordinates over in the same <point> form that
                        // is written for new statuses. The text must be passed as an
                        // argument: the builder ignores a closure's return value, so a
                        // value supplied that way never reaches the file.
                        def coordinates = e.geo.text().trim()
                        if (coordinates) {
                            point(coordinates)
                        }
                    }
                }
            }
        }
    }
} finally {
    // Always release the file handle, even if building the document fails part-way.
    writer.close()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment