Created
May 14, 2013 20:49
-
-
Save stephenturner/5579421 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Most of this code was adapted near-verbatim from Neil's post about ISMB 2012.
## http://nsaunders.wordpress.com/2012/08/16/twitter-coverage-of-the-ismb-2012-meeting-some-statistics/

## Local checkout of the twitterchive repo -- edit this path for your machine.
## Keep the trailing slash: the sub-directory name is appended directly to it.
repoDir <- "~/code/twitterchive/"

## Make the repo's "analysis" sub-directory the working directory.
setwd(paste0(repoDir, "analysis"))
#' Plot summary statistics for one twitterchive search archive
#'
#' Reads a fixed-width archive of tweets and writes up to five PNG files into
#' the current working directory, each named `<searchTerm>--<plot>.png`:
#' tweets per date (most recent days only), tweets per hour, top hashtags,
#' most prolific users, and a word cloud. A plot whose underlying table is
#' empty is skipped with a message instead of aborting the whole run.
#'
#' @param filename Path to a single fixed-width text file with four fields of
#'   widths 18, 14, 18 and 1000: tweet id, timestamp (parsed with
#'   `"%b %d %H:%M"`), user, and tweet text. The search term used in titles
#'   and output file names is the file's base name with a trailing `.txt`
#'   removed.
#' @param recentDays Number of days, counted back from the newest tweet,
#'   shown in the tweets-by-date plot.
#' @param topN Maximum number of hashtags / users shown in the "top" plots.
#' @return `NULL`, invisibly; called for its side effect of writing PNG files.
#' @details Requires the tm, wordcloud and RColorBrewer packages. Timestamps
#'   in the archive carry no year, so `strptime` fills in the current one.
twitterchivePlots <- function(filename = NULL, recentDays = 30, topN = 20) {
  ## ---- Validate input before doing any work ------------------------------
  if (!is.character(filename)) stop("filename must be character")
  if (length(filename) != 1L) stop("filename must be a single file path")
  if (!file.exists(filename)) stop(paste("File does not exist:", filename))

  ## Fail up front (rather than after several plots have been written) when a
  ## package is missing; require() would only return FALSE and carry on.
  neededPkgs <- c("tm", "wordcloud", "RColorBrewer")
  installed <- vapply(neededPkgs, requireNamespace, logical(1), quietly = TRUE)
  if (!all(installed)) {
    stop("Missing required package(s): ",
         paste(neededPkgs[!installed], collapse = ", "))
  }

  ## Anchor the pattern so only a trailing ".txt" extension is stripped.
  searchTerm <- sub("\\.txt$", "", basename(filename))
  message(paste("Filename:", filename))
  message(paste("Search Term: ", searchTerm))

  ## ---- Local helpers -----------------------------------------------------

  ## Open "<searchTerm>--<plotName>" as a PNG device, evaluate `code` (lazily,
  ## so it draws on that device) and always close the device, even on error.
  savePng <- function(plotName, width, height, code) {
    png(paste(searchTerm, plotName, sep = "--"), width = width, height = height)
    on.exit(dev.off(), add = TRUE)
    force(code)
    invisible(NULL)
  }

  ## Multi-line plot title: heading, search term and (optionally) today's date.
  plotTitle <- function(heading, stamp = TRUE) {
    lines <- c(heading, paste("Term:", searchTerm))
    if (stamp) lines <- c(lines, paste("Date:", Sys.Date()))
    paste(lines, collapse = "\n")
  }

  ## Frequency table as a two-column data frame (value, n). Unlike
  ## as.data.frame(table(x)), this keeps both columns when `x` is empty.
  countValues <- function(x) {
    counts <- table(x)
    data.frame(value = as.character(names(counts)), n = as.vector(counts),
               stringsAsFactors = FALSE)
  }

  ## Sort a (value, n) table by descending count.
  byCountDesc <- function(counts) {
    counts[order(counts$n, decreasing = TRUE), ]
  }

  ## Horizontal bar plot of a (value, n) table, largest bar at the top.
  plotTopCounts <- function(counts, plotName, heading) {
    if (nrow(counts) == 0L) {
      message("Nothing to plot for ", plotName, "; skipping.")
      return(invisible(NULL))
    }
    counts <- counts[order(counts$n), ]
    savePng(plotName, 1000, 700, {
      par(mar = c(5, 10, 4, 2))
      barplot(counts$n, names.arg = counts$value, horiz = TRUE, col = "black",
              las = 1, cex.names = 1.2, cex.axis = 1.2,
              main = plotTitle(heading))
    })
  }

  ## ---- Read in the data and munge around the dates -----------------------
  ## I can't promise the fixed widths will always work out for you.
  message("Reading in data.")
  ## Read every field as text: 18-digit tweet ids do not fit in a double.
  d <- read.fwf(filename, widths = c(18, 14, 18, 1000),
                colClasses = "character", comment.char = "")
  ## Trim leading/trailing whitespace column by column. `d[] <- lapply(...)`
  ## keeps the data-frame shape even when the file has a single row.
  d[] <- lapply(d, function(column) gsub("^\\s+|\\s+$", "", column))
  names(d) <- c("id", "datetime", "user", "text")
  d$user <- sub("@", "", d$user)

  ## Parse in UTC so a clock time that falls in a local daylight-saving gap is
  ## not lost when stored as POSIXct. Date and hour are read straight from the
  ## parsed fields, so they match what is written in the file.
  parsed <- as.POSIXlt(d$datetime, format = "%b %d %H:%M", tz = "UTC")
  d$datetime <- as.POSIXct(parsed)
  d$date <- as.Date(parsed)
  d$hour <- parsed$hour
  ## Lines that are continuations of a tweet containing a line break do not
  ## parse as a timestamp; they surface as NA here and are dropped.
  d <- na.omit(d)
  if (nrow(d) == 0L) stop(paste("No parsable tweets found in:", filename))

  ## ---- Number of tweets by date for the last n days ----------------------
  message(paste("Plotting number of tweets by date in the last", recentDays, "days."))
  recent <- d[d$date >= max(d$date) - recentDays, ]
  byDate <- countValues(recent$date)
  savePng("barplot-tweets-by-date.png", 1000, 700, {
    ## Wide bottom margin for the vertical date labels.
    par(mar = c(8.5, 4, 4, 1))
    barplot(byDate$n, names.arg = byDate$value, col = "black", las = 2,
            cex.names = 1.2, cex.axis = 1.2,
            main = plotTitle("Number of Tweets by Date", stamp = FALSE))
  })

  ## ---- Number of tweets by hour ------------------------------------------
  message("Plotting number of tweets by hour.")
  byHour <- countValues(d$hour)
  savePng("barplot-tweets-by-hour.png", 1000, 700, {
    barplot(byHour$n, names.arg = byHour$value, col = "black", las = 1,
            cex.names = 1.2, cex.axis = 1.2,
            main = plotTitle("Number of Tweets by Hour"))
  })

  ## ---- Barplot of top hashtags -------------------------------------------
  message(paste("Plotting top", topN, "hashtags."))
  tokens <- unlist(strsplit(d$text, " "))
  tags <- tolower(grep("^#", tokens, value = TRUE))
  tags <- gsub("[^A-Za-z0-9]", "", tags) # strip "#" and any other non-alphanumeric character
  tags <- tags[tags != ""]               # remove blanks
  hashtags <- byCountDesc(countValues(tags))
  ## Drop the most frequent hashtag: it is usually the search term itself and
  ## would dominate the plot.
  hashtags <- head(hashtags[-1, ], topN)
  plotTopCounts(hashtags, "barplot-top-hashtags.png", "Top hashtags")

  ## ---- Top users ---------------------------------------------------------
  message("Plotting most prolific users.")
  users <- byCountDesc(countValues(d$user))
  users <- head(users[users$value != searchTerm, ], topN)
  plotTopCounts(users, "barplot-top-users.png", "Most prolific users")

  ## ---- Word cloud --------------------------------------------------------
  message("Plotting a wordcloud.")
  words <- tolower(grep("^[A-Za-z0-9]+$", tokens, value = TRUE))
  words <- words[!(words %in% tm::stopwords("en"))] # remove stop words
  ## Remove RTs, MTs, via, using and the digits 1-9. Filtering with %in%
  ## (rather than negative grep() indices) keeps the remaining words even
  ## when none of these tokens occur.
  words <- words[!(words %in% c("mt", "rt", "via", "using", as.character(1:9)))]
  wordCounts <- byCountDesc(countValues(words))
  ## Drop the most frequent word: usually the term being searched for.
  wordCounts <- wordCounts[-1, ]
  if (nrow(wordCounts) > 0L) {
    savePng("wordcloud.png", 800, 800, {
      wordcloud::wordcloud(wordCounts$value, wordCounts$n, scale = c(8, .2),
                           min.freq = 3, max.words = 200,
                           random.order = FALSE, rot.per = .15,
                           colors = RColorBrewer::brewer.pal(8, "Dark2"))
    })
  } else {
    message("Nothing to plot for wordcloud.png; skipping.")
  }

  message(paste(searchTerm, ": All done!\n"))
  invisible(NULL)
}
## Archives to process, given as paths one level above the working directory.
filelist <- list(
  "../bioinformatics.txt",
  "../metagenomics.txt",
  "../rstats.txt",
  "../rna-seq.txt"
)

## Produce the full set of plots for each archive in turn.
lapply(X = filelist, FUN = twitterchivePlots)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment