require(stringr) |
require(RCurl) |
require(ggplot2) |
# Run a Google Visualization API query against a Google Spreadsheet and
# return the CSV response as a data frame.
#
# key:   spreadsheet key, appended to the URL as-is.
# query: query text (e.g. 'select *'); URL-escaped with curlEscape().
# gid:   sheet id within the spreadsheet, appended as-is (default 0).
#
# The URL is handed straight to read.csv(), which performs the download.
gsqAPI <- function(key, query, gid = 0) {
  query_url <- paste0(
    'http://spreadsheets.google.com/tq?',
    'tqx=out:csv',
    '&tq=', curlEscape(query),
    '&key=', key,
    '&gid=', gid
  )
  read.csv(query_url)
}
# Strip the first '@' from each element of x, e.g. "@name" -> "name".
# sub() replaces only the first match per element; elements without an '@'
# (and NA) pass through unchanged.
trim <- function(x) {
  sub(pattern = '@', replacement = '', x = x)
}
# Tabulate how often each name occurs in the to, from_user, rtof and rtby
# columns of df and combine the four tallies into a single data frame.
#
# df: data frame with columns to, from_user, rtof and rtby.
#
# Returns a data frame with columns Name, rtofCount, toCount, rtbyCount and
# fromCount. The tallies are joined with merge()'s defaults, so only names
# that occur in all four columns are kept. Progress is reported via print().
twCounts <- function(df) {
  # Frequency table of one column as a two-column (Name, <label>) data frame.
  tally <- function(values, label) {
    counts <- data.frame(table(values))
    colnames(counts) <- c('Name', label)
    counts
  }

  print("Counting @'d users")
  to_tally <- tally(df$to, 'toCount')
  print('Counting senders')
  from_tally <- tally(df$from_user, 'fromCount')
  print('Counting rtof users')
  rtof_tally <- tally(df$rtof, 'rtofCount')
  print('Counting rtby users')
  rtby_tally <- tally(df$rtby, 'rtbyCount')

  print('Merging datasets')
  # The join order determines the column order of the result.
  merged <- Reduce(merge, list(rtof_tally, to_tally, rtby_tally, from_tally))
  # Rebuild the factor so levels belonging to dropped names are discarded.
  merged$Name <- factor(merged$Name)
  merged
}
# Scatterplot of account names as text labels: fromCount on the x axis,
# toCount on the y axis, label size scaled by a retweet count.
#
# df: data frame with columns Name, fromCount, toCount and a retweet-count
#     column: rtCount if present, otherwise rtofCount (the name twCounts()
#     gives its "RT of" tally).
#
# Returns a ggplot object. Rows containing any NA are dropped before plotting.
twViz.scatter <- function(df) {
  # twCounts() never creates an rtCount column, so fall back to rtofCount
  # instead of failing with "object 'rtCount' not found".
  size_col <- if ('rtCount' %in% names(df)) 'rtCount' else 'rtofCount'
  ggplot(na.omit(df)) +
    geom_text(
      aes_string(x = 'fromCount', y = 'toCount', label = 'Name', size = size_col),
      # Rotation is a fixed setting rather than a data mapping, so it is
      # passed outside the aesthetic mapping under its full name.
      angle = 45
    )
}
# Fetch a tweet archive sheet and derive addressee / retweet columns from it.
#
# key: spreadsheet key passed to gsqAPI().
# gid: sheet id passed to gsqAPI().
#
# The fetched data frame must provide text and from_user columns. Returns it
# with three columns added:
#   to   - name the tweet starts with ("@name ..."), without the '@'; else NA
#   rtof - name following a leading "RT @", without the '@'; else NA
#   rtby - from_user for rows where rtof was found; else NA
# Progress is reported via print().
twArchParse <- function(key, gid) {
  print('Getting data')
  df <- gsqAPI(key, 'select *', gid)
  print('Got data')

  # read.csv() may return these columns as factors; work on plain strings.
  tweets <- as.character(df$text)
  senders <- as.character(df$from_user)

  print('Parsing @ messages')
  # str_extract() is vectorised, so the whole column is handled in one call.
  df$to <- trim(str_extract(tweets, "^(@[[:alnum:]_]*)"))

  print('Parsing RT: messages')
  # str_match() returns one row per tweet: column 1 is the full match,
  # column 2 the captured "@name". One vectorised call replaces the slow
  # per-row calls.
  df$rtof <- trim(str_match(tweets, "^RT (@[[:alnum:]_]*)")[, 2])

  print('Parsing RT: senders')
  # Test for a missing rtof with is.na() rather than comparing against the
  # text 'NA', so an account literally named "NA" is not mistaken for
  # "no retweet".
  is_retweet <- !is.na(df$rtof)
  rtby <- rep(NA_character_, nrow(df))
  rtby[is_retweet] <- senders[is_retweet]
  df$rtby <- rtby

  return(df)
}
# Turn dfc into a factor whose levels are ordered by ascending frequency of
# occurrence in dfc, so that bar charts of the result come out sorted.
#
# dfc: vector of values to tabulate.
#
# Returns factor(dfc) with levels running from least to most frequent value.
barsorter <- function(dfc) {
  ascending <- names(sort(table(dfc)))
  factor(dfc, levels = ascending)
}
# Scatterplot of account names as text labels, with the plotted columns
# chosen by name.
#
# df:  data frame with a Name column plus the columns named below.
# xax: name of the column mapped to the x axis (default 'fromCount').
# yax: name of the column mapped to the y axis (default 'toCount').
# zsz: name of the column mapped to label size (default 'rtofCount').
#
# Returns a ggplot object. Rows containing any NA are dropped before plotting.
twViz.scatter2 <- function(df, xax = 'fromCount', yax = 'toCount', zsz = 'rtofCount') {
  ggplot(na.omit(df)) +
    geom_text(
      aes_string(x = xax, y = yax, label = 'Name', size = zsz),
      # Rotation is a fixed setting rather than a data mapping, so it is
      # passed outside the aesthetic mapping under its full name.
      angle = 45
    )
}
# Example usage:
# Google Spreadsheet key and sheet id handed to twArchParse().
key <- '0AmbQbL4Lrd61dENiT1E4SFBTbWhzVExzTElwU0NTY3c'
gid <- 82
# Raw, unparsed sheet (not needed below):
# ukgc2012 <- gsqAPI(key, 'select *', gid)
ukgc2012.data <- twArchParse(key, gid)
ukgc2012.counts <- twCounts(ukgc2012.data)

# Shared x-axis label styling. theme()/element_text() are used because
# ggplot2 no longer provides opts()/theme_text().
xlabels.rotated <- theme(axis.text.x = element_text(angle = -90, size = 6))

# Bar chart of "RT of" counts
ggplot() +
  geom_bar(aes(x = na.omit(ukgc2012.data$rtof))) +
  xlabels.rotated +
  xlab(NULL)

# Same chart with bars sorted by count - "RT of"
ukgc2012.data$hrt <- barsorter(ukgc2012.data$rtof)
ggplot() +
  geom_bar(aes(x = na.omit(ukgc2012.data$hrt))) +
  xlabels.rotated +
  xlab(NULL)

# Bar chart of 'to' counts
ggplot() +
  geom_bar(aes(x = na.omit(ukgc2012.data$to))) +
  xlabels.rotated +
  xlab(NULL)

# Bar chart of 'from' counts
ggplot() +
  geom_bar(aes(x = na.omit(ukgc2012.data$from_user))) +
  xlabels.rotated +
  xlab(NULL)

# Ordered bar chart of the tabulated 'to' counts
ukgc2012.counts$Name <- reorder(ukgc2012.counts$Name, ukgc2012.counts$toCount)
ggplot(ukgc2012.counts) +
  geom_bar(stat = "identity", aes(x = Name, y = toCount)) +
  xlabels.rotated +
  xlab(NULL)

# Scatterplot with 'from' counts on the x axis, 'to' counts on the y axis,
# and label size given by the "RT of" count
twViz.scatter2(ukgc2012.counts)