Skip to content

Instantly share code, notes, and snippets.

@mjbommar
Created February 21, 2011 05:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save mjbommar/836711 to your computer and use it in GitHub Desktop.
Save mjbommar/836711 to your computer and use it in GitHub Desktop.
Archive a twitter hashtag.
#@author Michael J Bommarito
#@contact michael.bommarito@gmail.com
#@date Feb 20, 2011
#@ip Simplified BSD, (C) 2011.
# This is a simple example of an R script that retrieves the public
# tweets for a given hashtag and archives them in a tab-separated file.
library(RJSONIO)
# Load the stored tweets for a hashtag from its archive file.
#
# Arguments:
#   tag  Hashtag without the leading "#". The archive is read from
#        "tweet_<tag>.csv" (tab-separated, with a header row) in the
#        current working directory.
#
# Returns:
#   A data.frame with one row per stored tweet. Every column is read as
#   character so that tweet IDs keep all of their digits and all-digit
#   user names are not turned into numbers. If a "date" column is
#   present it is converted to POSIXct in GMT; values that cannot be
#   parsed become NA.
loadTag <- function(tag) {
  # Set the filename
  fileName <- sprintf("tweet_%s.csv", tag)

  # colClasses = "character" switches off read.table's type guessing.
  # Without it, quoted numeric-looking fields are still converted to
  # doubles, which cannot represent large tweet IDs exactly.
  tweets <- read.table(
    file = fileName,
    sep = "\t",
    header = TRUE,
    comment.char = "",
    colClasses = "character",
    stringsAsFactors = FALSE
  )

  if ("date" %in% names(tweets)) {
    # Parse element by element: a timestamp may have been written either
    # as "YYYY-MM-DD HH:MM:SS" or as a bare "YYYY-MM-DD", so try the full
    # format first and fall back to the date-only form where it failed.
    rawDates <- tweets$date
    parsed <- as.POSIXct(strptime(rawDates, "%Y-%m-%d %H:%M:%S", tz = "GMT"))
    dateOnly <- is.na(parsed) & !is.na(rawDates)
    if (any(dateOnly)) {
      parsed[dateOnly] <- as.POSIXct(
        strptime(rawDates[dateOnly], "%Y-%m-%d", tz = "GMT")
      )
    }
    tweets$date <- parsed
  }

  return(tweets)
}
# Download the public tweets for a hashtag and archive them on disk.
#
# The function pages through the search endpoint, appending each page of
# results to "tweet_<tag>.csv" (tab-separated) in the current working
# directory. If that file already exists it is loaded first via loadTag()
# and the download resumes below the smallest tweet ID already stored.
# The loop stops as soon as a query yields no tweet that is not already
# in the archive.
#
# Arguments:
#   tag           Hashtag without the leading "#".
#   sleepSeconds  Pause between successive queries, in seconds
#                 (default 10, to be nice to the remote service).
#
# Returns:
#   A data.frame with character columns "id" and "user" and a POSIXct
#   (GMT) column "date", or NULL when nothing was stored or downloaded.
#
# Tweet text is deliberately not stored: per the original author's note,
# no JSON library for R supported Unicode at the time of writing.
downloadTag <- function(tag, sleepSeconds = 10) {
  # Set the filename
  fileName <- sprintf("tweet_%s.csv", tag)
  # NOTE(review): confirm this endpoint is still served before relying on it.
  baseURL <- "http://search.twitter.com/search.json"
  # Percent-encode the tag so unusual characters cannot break the query.
  queryTag <- utils::URLencode(as.character(tag), reserved = TRUE)

  # Render IDs as plain digit strings. Numeric IDs are printed without
  # scientific notation; anything else is passed through as text.
  idsAsText <- function(ids) {
    if (!is.numeric(ids)) {
      return(as.character(ids))
    }
    text <- sprintf("%.0f", ids)
    text[is.na(ids)] <- NA_character_
    text
  }

  # Numerically smallest ID among digit strings, or "0" when there is
  # none. Shorter strings are smaller numbers, so order by length first;
  # a plain min() would compare lexicographically ("100000" < "9999").
  smallestId <- function(ids) {
    ids <- ids[!is.na(ids)]
    if (length(ids) == 0) {
      return("0")
    }
    ids[order(nchar(ids), ids)][1]
  }

  # Convert stored timestamps back to POSIXct (GMT), accepting both the
  # "YYYY-MM-DD HH:MM:SS" and the bare "YYYY-MM-DD" form per element.
  parseStoredDates <- function(dates) {
    if (inherits(dates, "POSIXct")) {
      return(dates)
    }
    text <- as.character(dates)
    parsed <- as.POSIXct(strptime(text, "%Y-%m-%d %H:%M:%S", tz = "GMT"))
    dateOnly <- is.na(parsed) & !is.na(text)
    if (any(dateOnly)) {
      parsed[dateOnly] <- as.POSIXct(
        strptime(text[dateOnly], "%Y-%m-%d", tz = "GMT")
      )
    }
    parsed
  }

  # Exact-name lookup; `$` would partially match other field names.
  fieldOf <- function(x, name) {
    if (name %in% names(x)) x[[name]] else NULL
  }

  # One field of one result as a single string, NA when absent or empty.
  textField <- function(result, name) {
    value <- fieldOf(result, name)
    if (length(value) == 0) NA_character_ else as.character(value[[1]])
  }

  # ID of one result as a digit string. The string form ("id_str") is
  # preferred when the response provides it, because a numeric "id" may
  # already have lost digits while being parsed as a double.
  tweetId <- function(result) {
    id <- fieldOf(result, "id_str")
    if (length(id) == 0) {
      id <- fieldOf(result, "id")
    }
    if (length(id) == 0) NA_character_ else idsAsText(id[[1]])
  }

  # Check to see if the file exists. If it does, load it.
  if (file.exists(fileName)) {
    stored <- loadTag(tag)
    missingCols <- setdiff(c("id", "date", "user"), names(stored))
    if (length(missingCols) > 0) {
      stop(
        sprintf(
          "%s is missing column(s): %s",
          fileName, paste(missingCols, collapse = ", ")
        ),
        call. = FALSE
      )
    }
    # Normalise the column types so the archive can be row-bound with
    # freshly downloaded tweets regardless of how it was read back.
    tweets <- data.frame(
      id = idsAsText(stored[["id"]]),
      date = parseStoredDates(stored[["date"]]),
      user = as.character(stored[["user"]]),
      stringsAsFactors = FALSE
    )
    maxID <- smallestId(tweets$id)
  } else {
    tweets <- NULL
    maxID <- "0"
  }

  # Record the nextPage query when provided.
  nextPage <- NULL

  # Loop until a query brings nothing new.
  repeat {
    if (!is.null(nextPage)) {
      queryURL <- sprintf("%s%s", baseURL, nextPage)
    } else if (maxID != "0") {
      queryURL <- sprintf(
        "%s?q=%%23%s&rpp=100&max_id=%s", baseURL, queryTag, maxID
      )
    } else {
      queryURL <- sprintf("%s?q=%%23%s&rpp=100&", baseURL, queryTag)
    }

    # Execute the query
    response <- fromJSON(queryURL)
    newTweets <- fieldOf(response, "results")

    # Check to make sure that there are tweets left.
    if (length(newTweets) == 0) {
      print(sprintf("No new tweets: %s %s", maxID, queryURL))
      break
    }

    # Extract each field with a fixed-width result so a missing field
    # yields NA instead of a ragged structure.
    ids <- vapply(newTweets, tweetId, character(1), USE.NAMES = FALSE)
    rawDates <- vapply(
      newTweets, function(x) textField(x, "created_at"),
      character(1), USE.NAMES = FALSE
    )
    users <- vapply(
      newTweets, function(x) textField(x, "from_user"),
      character(1), USE.NAMES = FALSE
    )
    if (any(is.na(ids))) {
      warning(
        sprintf("Skipping %d result(s) without an id", sum(is.na(ids))),
        call. = FALSE
      )
    }
    dfTweets <- data.frame(
      id = ids,
      date = as.POSIXct(
        strptime(rawDates, "%a, %d %b %Y %H:%M:%S %z", tz = "GMT")
      ),
      user = users,
      stringsAsFactors = FALSE
    )

    # Keep only tweets not seen before. A max_id query can return the
    # boundary tweet that is already archived; it must not be appended
    # a second time.
    isNew <- !is.na(dfTweets$id) &
      !duplicated(dfTweets$id) &
      !(dfTweets$id %in% tweets$id)
    dfTweets <- dfTweets[isNew, , drop = FALSE]
    if (nrow(dfTweets) == 0) {
      print(sprintf("No new tweets: %s %s", maxID, queryURL))
      break
    }

    # Now check for a nextPage query (NULL when the response has none).
    nextPage <- fieldOf(response, "next_page")

    # Append these tweets to the list.
    if (is.null(tweets)) {
      tweets <- dfTweets
    } else {
      tweets <- rbind(tweets, dfTweets)
    }

    # Now update our maxID variable.
    maxID <- smallestId(tweets$id)

    # Store the current set of tweets after every page, so an
    # interrupted run can be resumed from the file.
    write.table(tweets, sep = "\t", file = fileName, row.names = FALSE)

    # Output some debug info and sleep to be nice to Twitter.
    print(sprintf("%s, %s", maxID, nrow(tweets)))
    utils::flush.console()
    Sys.sleep(sleepSeconds)
  }

  return(tweets)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment