public
Last active

Tools for processing and visualising data from a TAGS archive

  • Download Gist
TAGS_Stats.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
require(stringr)
require(RCurl)
require(ggplot2)
# Query a Google spreadsheet through the spreadsheets.google.com `tq`
# endpoint and read the CSV response into a data frame.
#
# key:   spreadsheet key, placed verbatim in the URL.
# query: query text; URL-escaped with curlEscape() before being sent.
# gid:   sheet id within the spreadsheet (defaults to 0).
#
# Returns whatever read.csv() parses from the response.
gsqAPI <- function(key, query, gid = 0) {
  url <- paste0(
    'http://spreadsheets.google.com/tq?',
    'tqx=out:csv',
    '&tq=', curlEscape(query),
    '&key=', key,
    '&gid=', gid
  )
  read.csv(url)
}
 
# Remove the first '@' found in each element of x (e.g. "@user" -> "user").
# Elements without an '@' are returned unchanged; NA stays NA.
trim <- function(x) {
  sub(pattern = '@', replacement = '', x = x)
}
 
# Tabulate how often each name occurs in the `to`, `from_user`, `rtof` and
# `rtby` columns of df and combine the four tallies into one data frame.
#
# df: data frame providing the columns `to`, `from_user`, `rtof`, `rtby`.
#
# Returns a data frame with the columns Name, rtofCount, toCount, rtbyCount
# and fromCount. merge() is used with its defaults (an inner join on the
# shared `Name` column), so only names present in all four tallies remain.
twCounts <- function(df) {
  # Frequency table of one vector as a two-column (Name, <countName>) frame.
  tally <- function(values, countName) {
    counted <- data.frame(table(values))
    colnames(counted) <- c('Name', countName)
    counted
  }

  print("Counting @'d users")
  to.count <- tally(df$to, 'toCount')
  print('Counting senders')
  from.count <- tally(df$from_user, 'fromCount')
  print('Counting rtof users')
  rtof.count <- tally(df$rtof, 'rtofCount')
  print('Counting rtby users')
  rtby.count <- tally(df$rtby, 'rtbyCount')

  print('Merging datasets')
  # Fold left to right so the column order matches the pairwise merges:
  # rtof, then to, then rtby, then from.
  merged <- Reduce(merge, list(rtof.count, to.count, rtby.count, from.count))
  # Rebuild the factor so it only carries levels that survived the merge.
  merged$Name <- factor(merged$Name)

  merged
}
 
 
# Scatter plot of user names: x = fromCount, y = toCount, with the label
# size driven by a retweet count and the labels rotated by 45 degrees.
#
# df: data frame with the columns Name, fromCount, toCount and either
#     rtCount or rtofCount (twCounts() produces rtofCount). Rows that
#     contain NA are dropped before plotting.
#
# Returns the ggplot object.
twViz.scatter <- function(df) {
  # twCounts() never emits an `rtCount` column, so fall back to `rtofCount`
  # unless the caller really supplied `rtCount`.
  sizeCol <- if ('rtCount' %in% names(df)) 'rtCount' else 'rtofCount'
  # `angle` is a constant, so it is passed as a fixed parameter rather than
  # inside the aesthetic mapping (where the misspelt `angl` was ignored).
  ggplot(na.omit(df)) +
    geom_text(
      aes_string(x = 'fromCount', y = 'toCount', label = 'Name', size = sizeCol),
      angle = 45
    )
}
 
# Download a TAGS archive sheet and derive addressing / retweet columns.
#
# key: spreadsheet key passed to gsqAPI().
# gid: sheet id passed to gsqAPI().
#
# Returns the downloaded data frame with three added columns:
#   to   - user name a tweet opens with ("@name ..."), else NA
#   rtof - user name being retweeted ("RT @name ..."), else NA
#   rtby - sender (from_user) of the tweet when it is a retweet, else NA
# Stops with an error if the sheet lacks a `text` or `from_user` column.
twArchParse <- function(key, gid) {
  print('Getting data')
  df <- gsqAPI(key, 'select *', gid)
  print('Got data')

  missingCols <- setdiff(c('text', 'from_user'), names(df))
  if (length(missingCols) > 0) {
    stop('Archive is missing required column(s): ',
         paste(missingCols, collapse = ', '), call. = FALSE)
  }

  # read.csv() may deliver factors; the stringr calls need plain text.
  tweets <- as.character(df$text)

  print('Parsing @ messages')
  # str_extract() is vectorised, so the whole column is handled in one call.
  df$to <- trim(str_extract(tweets, "^(@[[:alnum:]_]*)"))

  print('Parsing RT: messages')
  # str_match() returns one row per tweet; column 2 is the captured @name.
  df$rtof <- trim(str_match(tweets, "^RT (@[[:alnum:]_]*)")[, 2])

  print('Parsing RT: senders')
  # Test for a missing rtof with is.na() instead of comparing against the
  # text "NA", so a retweet of a user literally called "NA" is still counted.
  rtby <- as.character(df$from_user)
  rtby[is.na(df$rtof)] <- NA
  df$rtby <- rtby

  df
}
 
# Convert dfc to a factor whose levels are ordered by ascending frequency
# of occurrence in dfc, so bar charts of the result come out sorted.
# NA values are not counted and stay NA in the result.
barsorter <- function(dfc) {
  freq <- table(dfc)
  ascending <- order(freq)
  factor(dfc, levels = names(freq)[ascending])
}
 
# Scatter plot of user names with configurable columns; labels are rotated
# by 45 degrees.
#
# df:  data frame containing a `Name` column plus the columns named below;
#      rows that contain NA are dropped before plotting.
# xax: name of the column mapped to the x axis (default 'fromCount').
# yax: name of the column mapped to the y axis (default 'toCount').
# zsz: name of the column mapped to the label size (default 'rtofCount').
#
# Returns the ggplot object.
twViz.scatter2 <- function(df, xax = 'fromCount', yax = 'toCount', zsz = 'rtofCount') {
  # `angle` is a constant, so it is a fixed geom parameter; the misspelt
  # `angl` inside the aesthetic mapping was not recognised as `angle`.
  ggplot(na.omit(df)) +
    geom_text(
      aes_string(x = xax, y = yax, label = 'Name', size = zsz),
      angle = 45
    )
}
 
 
# Example usage:
key <- '0AmbQbL4Lrd61dENiT1E4SFBTbWhzVExzTElwU0NTY3c'
gid <- 82

# ukgc2012 <- gsqAPI(key, 'select *', gid)

ukgc2012.data <- twArchParse(key, gid)
ukgc2012.counts <- twCounts(ukgc2012.data)

# Shared x-axis styling: small labels rotated to read vertically.
# theme()/element_text() replace the defunct opts()/theme_text().
rotatedLabels <- theme(axis.text.x = element_text(angle = -90, size = 6))

# Plot a bar chart of "RT of" counts
ggplot() + geom_bar(aes(x = na.omit(ukgc2012.data$rtof))) + rotatedLabels + xlab(NULL)

# Sorted plot based on computed counts - "RT of"
ukgc2012.data$hrt <- barsorter(ukgc2012.data$rtof)
ggplot() + geom_bar(aes(x = na.omit(ukgc2012.data$hrt))) + rotatedLabels + xlab(NULL)

# Plot a bar chart of 'to' computed counts
ggplot() + geom_bar(aes(x = na.omit(ukgc2012.data$to))) + rotatedLabels + xlab(NULL)

# Plot a bar chart of 'from' computed counts
ggplot() + geom_bar(aes(x = na.omit(ukgc2012.data$from_user))) + rotatedLabels + xlab(NULL)

# Plot an ordered bar chart of 'to' tabulated counts
ukgc2012.counts$Name <- reorder(ukgc2012.counts$Name, ukgc2012.counts$toCount)
ggplot(ukgc2012.counts) + geom_bar(stat = "identity", aes(x = Name, y = toCount)) + rotatedLabels + xlab(NULL)

# Plot a scatterplot displaying from and to counts on the x and y axes,
# with label size showing the "RT of" count
twViz.scatter2(ukgc2012.counts)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.