Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Plot a time series of a hashtag where height is tweet count and color is unique user count.
# @author: Bommarito Consulting, LLC;
# @date: May 21, 2012
# @email:
# @packages: ggplot2, plyr
# Import.
library(ggplot2)

# Controlling parameters.
hashtag <- "#nonato" # Hashtag for label purposes
# NOTE: "EDT" is not a valid Olson time zone name (R treats unknown names as
# UTC), so the US Eastern zone is spelled out explicitly here.
cutoff <- as.POSIXct("2012-01-11 00:00:00", tz = "America/New_York") # First timestamp we will consider
dt <- 30 # \Delta t, minutes

# Load and pre-process tweets.
# Input is a tab-separated file without header: id, date, user, text.
tweets <- unique(read.table("data/tweets.csv", sep = "\t", quote = "", comment.char = "",
                            stringsAsFactors = FALSE, header = FALSE, nrows = 300000))
names(tweets) <- c("id", "date", "user", "text")
tweets$date <- as.POSIXct(strptime(tweets$date, "%a, %d %b %Y %H:%M:%S %z", tz = "GMT"))
# which() also drops rows whose date failed to parse (NA).
tweets <- tweets[which(tweets$date > cutoff), ]
if (nrow(tweets) == 0) {
  stop("No tweets found after the cutoff ", format(cutoff, usetz = TRUE), call. = FALSE)
}

# Build date breaks: fixed-width bins of dt minutes covering all tweets.
binSeconds <- 60 * dt
minDate <- min(tweets$date)
maxDate <- max(tweets$date) + binSeconds
dateBreaks <- seq(minDate, maxDate, by = binSeconds)
nBins <- length(dateBreaks) - 1

# Strip out the right-most break to keep the left endpoint of each bin.
binBreaks <- head(dateBreaks, -1)

# Assign every tweet to exactly one bin. Bins are (left, right], with the very
# first bin also including its left endpoint -- the same convention hist()
# uses by default -- so a tweet sitting on a break is never counted twice.
binIndex <- cut(as.numeric(tweets$date), breaks = as.numeric(dateBreaks),
                labels = FALSE, right = TRUE, include.lowest = TRUE)

# Count the number of tweets per bin.
tweetCount <- tabulate(binIndex, nbins = nBins)

# Count number of unique tweeters per bin: de-duplicate (bin, user) pairs, then
# count the remaining rows per bin. Empty bins get 0.
userBins <- unique(data.frame(bin = binIndex, user = tweets$user,
                              stringsAsFactors = FALSE))
userCount <- tabulate(userBins$bin, nbins = nBins)

# Plot data: bar height is the tweet count, bar colour the unique user count.
plotData <- data.frame(dates = binBreaks,
                       tweets = as.numeric(tweetCount),
                       users = as.numeric(userCount))
tweetPlot <- ggplot(plotData) +
  geom_bar(aes(x = dates, y = tweets, color = users), stat = "identity") +
  scale_x_datetime("Date") +
  scale_y_continuous("Number of tweets") +
  # opts() was removed from ggplot2; ggtitle() is its replacement for titles.
  ggtitle(paste("Number of tweets and unique users :", hashtag))
tweetPlot

# Save the plot explicitly rather than relying on the last displayed plot, and
# make sure the output directory exists first.
dir.create("fig", showWarnings = FALSE)
ggsave("fig/ts_tweet_user.jpg", plot = tweetPlot, width = 12, height = 8)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment