Skip to content

Instantly share code, notes, and snippets.

@meefen
Forked from psychemedia/TAGS_Stats.R
Last active December 14, 2015 03:58
Show Gist options
  • Save meefen/5024366 to your computer and use it in GitHub Desktop.
Save meefen/5024366 to your computer and use it in GitHub Desktop.
#------------------------
# Load libraries
require(stringr)
require(RCurl)
require(ggplot2)
#------------------------
# Read a published Google Spreadsheet sheet and return it as a data frame.
#
# The file should first be published on Google Drive. Google Drive uses
# https, which is why the CSV export is fetched through RCurl's getURL().
#
# Args:
#   key: spreadsheet key; inserted as the `key=` query parameter.
#   gid: sheet id; inserted as the `gid=` query parameter (default 0).
#
# Returns:
#   The data frame produced by read.csv() on the downloaded CSV text, with
#   strings kept as character vectors (stringsAsFactors = FALSE).
gsqAPI <- function(key, gid = 0) {
  url <- paste0("https://docs.google.com/spreadsheet/pub?key=", key,
                "&single=true&gid=", gid, "&output=csv")
  conn <- textConnection(getURL(url))
  # Release the connection even if read.csv() fails on the downloaded text.
  on.exit(close(conn), add = TRUE)
  read.csv(conn, stringsAsFactors = FALSE)
}
# Remove the first "@" found in each element of x, together with a "."
# directly in front of it when there is one.
# Some reply messages start with ".@user", hence the optional dot.
trim <- function(x) {
  mention_marker <- "(\\.)?@"
  sub(pattern = mention_marker, replacement = "", x = x)
}
# Count per-user statistics from a parsed tweet archive.
#
# Args:
#   df: data frame with the columns `to`, `from_user`, `rtof` and `rtby`.
#
# Returns:
#   A data frame with the columns Name (factor), rtofCount, toCount,
#   rtbyCount and fromCount. The `rtof` tally is the left-hand side of every
#   merge, so only users that occur in `rtof` get a row; counts that are
#   missing after merging are set to 0.
twCounts <- function(df) {
  # Frequency table of one column as a (Name, <count_col>) data frame.
  tally <- function(values, count_col) {
    freq <- data.frame(table(values))
    names(freq) <- c("Name", count_col)
    freq
  }

  print("Counting @'d users")
  mentioned <- tally(df$to, "toCount")
  print("Counting senders")
  senders <- tally(df$from_user, "fromCount")
  print("Counting rtof users")
  retweeted <- tally(df$rtof, "rtofCount")
  print("Counting rtby users")
  retweeters <- tally(df$rtby, "rtbyCount")

  print("Merging datasets")
  # Successive left joins on Name, starting from the retweeted users.
  counts <- Reduce(
    function(left, right) merge(left, right, by = "Name", all.x = TRUE),
    list(retweeted, mentioned, retweeters, senders)
  )
  counts[is.na(counts)] <- 0
  counts$Name <- factor(counts$Name)
  counts
}
# Download a tweet archive and derive the reply / retweet columns.
#
# Args:
#   key: spreadsheet key, passed on to gsqAPI().
#   gid: sheet id, passed on to gsqAPI().
#
# Returns:
#   The data frame returned by gsqAPI() with three added columns:
#     to   - user name matched at the very start of `text` ("@user" or
#            ".@user"), with the marker stripped by trim(); NA if no match.
#     rtof - user name following a leading "RT " or "MT " in `text`, with the
#            "@" stripped by trim(); NA if no match.
#     rtby - `from_user` on rows where `rtof` is not NA, NA otherwise.
twArchParse <- function(key, gid) {
  print("Getting data")
  df <- gsqAPI(key, gid)
  print("Got data")

  print("Parsing @ messages")
  # stringr functions are vectorised, so the whole column is matched at once.
  df$to <- trim(str_extract(df$text, "^((\\.)?(@[[:alnum:]_]*))"))

  print("Parsing RT: messages")
  # Column 2 of the str_match() result holds the first capture group.
  df$rtof <- trim(str_match(df$text, "^[MR]T (@[[:alnum:]_]*)")[, 2])

  print("Parsing RT: senders")
  # Test for missing values directly. Pasting the columns together and
  # comparing against the string 'NA' would also discard retweets of a user
  # whose screen name is literally "NA".
  rtby <- as.character(df$from_user)
  rtby[is.na(df$rtof)] <- NA
  df$rtby <- rtby
  df
}
# Turn a vector into a factor whose levels are ordered by decreasing
# frequency, so that a bar plot of the result is sorted from the most to the
# least common value.
barsorter <- function(dfc) {
  tally <- table(dfc)
  by_frequency <- order(-as.vector(tally))
  factor(dfc, levels = names(tally)[by_frequency])
}
#------------------------
# Example usage: #ODDTO13 archive
key <- "0Aup6zwZoYbZ1dEZBeG83bTNlOXpxQVFDSklNQ2RjTEE"
gid <- 82

# Read and parse data
archive.data <- twArchParse(key, gid)

# Normalise column types: id columns as character, user names as factors.
archive.data$id_str <- as.character(archive.data$id_str)
archive.data$from_user <- as.factor(archive.data$from_user)
archive.data$from_user_id_str <- as.character(archive.data$from_user_id_str)
# POSIXct rather than POSIXlt: POSIXlt is a list of date-time components,
# while POSIXct is a plain numeric vector and is the class intended for
# data frame columns.
archive.data$time <- as.POSIXct(archive.data$time, tz = "GMT",
                                format = "%d/%m/%Y %H:%M:%S")
archive.data$in_reply_to_user_id_str <-
  as.character(archive.data$in_reply_to_user_id_str)
archive.data$in_reply_to_screen_name <-
  as.factor(archive.data$in_reply_to_screen_name)
archive.data$in_reply_to_status_id_str <-
  as.character(archive.data$in_reply_to_status_id_str)

# Compute user stats
archive.counts <- twCounts(archive.data)

# Plot a bar chart of "RT of" counts
ggplot() +
  geom_bar(aes(x = na.omit(archive.data$rtof))) +
  theme(axis.text.x = element_text(angle = -90, size = 9)) +
  xlab("Users")

# Sorted plot based on computed counts - "RT of"
archive.data$hrt <- barsorter(archive.data$rtof)
ggplot() +
  geom_bar(aes(x = na.omit(archive.data$hrt))) +
  theme(axis.text.x = element_text(angle = -90, size = 9)) +
  xlab("Users")

# Plot a bar chart of 'to' computed counts
ggplot() +
  geom_bar(aes(x = na.omit(archive.data$to))) +
  theme(axis.text.x = element_text(angle = -90, size = 9)) +
  xlab("Users")

# Plot a bar chart of 'from' computed counts
ggplot() +
  geom_bar(aes(x = na.omit(archive.data$from_user))) +
  theme(axis.text.x = element_text(angle = -90, size = 6)) +
  xlab("Users")

# Plot an ordered bar chart of 'to' tabulated counts
# (Name is reordered by toCount, and toCount is what is drawn)
archive.counts$Name <- reorder(archive.counts$Name, archive.counts$toCount)
ggplot(archive.counts) +
  geom_bar(stat = "identity", aes(x = Name, y = toCount)) +
  theme(axis.text.x = element_text(angle = -90, size = 9)) +
  xlab("Users")

# Plot a scatterplot displaying from and to counts on the x and y axes, with
# the label size showing the "RT of" count
# NOTE(review): size = 10 inside aes() is mapped through the same size scale
# as rtofCount instead of being a literal point size. It is left as written
# because moving it out of aes() would visibly change the plot -- confirm the
# intended point size.
ggplot(na.omit(archive.counts)) +
  geom_point(aes(x = fromCount, y = toCount, size = 10)) +
  # angle is a constant, so it is set as a fixed parameter outside aes()
  geom_text(aes(x = fromCount, y = toCount, label = Name, size = rtofCount),
            angle = 45)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment