Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

Tools for processing and visualising data from a TAGS archive

View TAGS_Stats.R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
require(stringr)
require(RCurl)
require(ggplot2)
# Run a query against a Google spreadsheet via the spreadsheets.google.com/tq
# endpoint (CSV output requested) and read the response into a data frame.
#   key   - spreadsheet key
#   query - query string; URL-escaped with curlEscape() before being sent
#   gid   - sheet id within the spreadsheet (defaults to 0)
gsqAPI <- function(key,query,gid=0){
  endpoint <- 'http://spreadsheets.google.com/tq?'
  params <- paste0('tqx=out:csv','&tq=',curlEscape(query),'&key=',key,'&gid=',gid)
  read.csv(paste0(endpoint,params))
}
 
# Remove the first '@' found in each element of x (e.g. '@user' -> 'user').
trim <- function (x) {
  sub(pattern='@', replacement='', x=x)
}
 
# Tabulate how often each user name appears in the to, from_user, rtof and
# rtby columns of df and combine the four tallies into one data frame.
#
# Returns a data frame with columns Name, rtofCount, toCount, rtbyCount and
# fromCount. The tallies are combined with merge() defaults, so only names
# present in all four tallies are kept. Name is returned as a factor.
twCounts <- function(df){
  # Tabulate one column into a two-column (Name, <label>) data frame.
  tally <- function(values,label){
    counted <- data.frame(table(values))
    colnames(counted) <- c('Name',label)
    counted
  }

  print("Counting @'d users")
  to.count <- tally(df$to,'toCount')
  print('Counting senders')
  from.count <- tally(df$from_user,'fromCount')
  print('Counting rtof users')
  rtof.count <- tally(df$rtof,'rtofCount')
  print('Counting rtby users')
  rtby.count <- tally(df$rtby,'rtbyCount')

  print('Merging datasets')
  # Fold left to right so the column order matches the merge sequence.
  merged <- Reduce(merge,list(rtof.count,to.count,rtby.count,from.count))
  merged$Name <- factor(merged$Name)
  merged
}
 
 
# Scatter plot of user names as text labels: x = fromCount, y = toCount,
# label size = rtofCount, labels drawn at a 45 degree angle.
#
#   df - data frame with columns Name, fromCount, toCount and rtofCount
#        (the column names twCounts() produces). Rows containing NA are
#        dropped before plotting.
#
# Returns the ggplot object.
twViz.scatter=function(df){
  # size maps to rtofCount: twCounts() emits no 'rtCount' column, so the
  # previous mapping could not be resolved. angle is a fixed setting, so it
  # sits outside aes() rather than being treated as a data mapping.
  ggplot(na.omit(df))+geom_text(aes(x=fromCount,y=toCount,label=Name,size=rtofCount),angle=45)
}
 
# Download a sheet via gsqAPI() and derive addressee / retweet columns from
# the tweet text.
#
#   key - spreadsheet key, passed through to gsqAPI()
#   gid - sheet id, passed through to gsqAPI()
#
# Returns the downloaded data frame with three columns added:
#   to   - handle (leading '@' removed) when the text starts with "@handle",
#          otherwise NA
#   rtof - handle (leading '@' removed) when the text starts with "RT @handle",
#          otherwise NA
#   rtby - from_user on rows where rtof is set, otherwise NA
#
# The sheet must provide `text` and `from_user` columns.
twArchParse=function(key,gid){
  print('Getting data')
  df=gsqAPI(key,'select *',gid)
  print('Got data')

  # read.csv may return factors on older R versions; work on plain character.
  tweets=as.character(df$text)

  print('Parsing @ messages')
  df$to=trim(str_extract(tweets,"^(@[[:alnum:]_]*)"))

  print('Parsing RT: messages')
  # str_match (kept as a demonstration of capture groups) is applied to the
  # whole column in one call; column 2 of its result is the captured handle.
  df$rtof=trim(str_match(tweets,"^RT (@[[:alnum:]_]*)")[,2])

  print('Parsing RT: senders')
  # Test for missing values directly instead of comparing against the text
  # 'NA', so a retweet of a user literally called "NA" is not discarded.
  retweeters=as.character(df$from_user)
  retweeters[is.na(df$rtof)]=NA
  df$rtby=retweeters

  return(df)
}
 
# Convert dfc to a factor whose levels are ordered by ascending frequency,
# so that bar charts of the result come out sorted by count.
barsorter <- function (dfc){
  counts <- table(dfc)
  ranking <- order(counts)
  factor(dfc, levels = names(counts)[ranking])
}
 
# Scatter plot of user names as text labels, with the plotted columns chosen
# by name.
#
#   df  - data frame containing a Name column plus the columns named below;
#         rows containing NA are dropped before plotting
#   xax - name of the column mapped to x (default 'fromCount')
#   yax - name of the column mapped to y (default 'toCount')
#   zsz - name of the column mapped to label size (default 'rtofCount')
#
# Returns the ggplot object.
twViz.scatter2=function(df,xax='fromCount',yax='toCount',zsz='rtofCount'){
  # angle is a fixed setting rather than a data mapping, so it is passed to
  # geom_text() directly (it was previously misspelt 'angl' inside the
  # aesthetic mapping).
  ggplot(na.omit(df))+geom_text(aes_string(x=xax,y=yax,label='Name',size=zsz),angle=45)
}
 
 
#Example usage:
# Spreadsheet key and sheet id of the archive used in this example.
key <- '0AmbQbL4Lrd61dENiT1E4SFBTbWhzVExzTElwU0NTY3c'
gid <- 82

# Raw, unparsed download (kept for reference):
#ukgc2012=gsqAPI(key,'select *',gid)

# Fetch and parse the archive, then tabulate per-user counts.
ukgc2012.data <- twArchParse(key,gid)
ukgc2012.counts <- twCounts(ukgc2012.data)
 
 
# Shared x-axis label styling for the bar charts below: small, rotated text.
# theme()/element_text() are used because opts()/theme_text() are defunct in
# ggplot2.
ukgc2012.xtheme = theme(axis.text.x=element_text(angle=-90,size=6))

#plot a bar chart of RT of counts
ggplot() + geom_bar(aes(x=na.omit(ukgc2012.data$rtof))) + ukgc2012.xtheme + xlab(NULL)

#sorted plot based on computed counts - "RT of"
ukgc2012.data$hrt=barsorter(ukgc2012.data$rtof)
ggplot() + geom_bar(aes(x=na.omit(ukgc2012.data$hrt))) + ukgc2012.xtheme + xlab(NULL)


#plot a bar chart of 'to' computed counts
ggplot() + geom_bar(aes(x=na.omit(ukgc2012.data$to))) + ukgc2012.xtheme + xlab(NULL)

#plot a bar chart of 'from' computed counts
ggplot() + geom_bar(aes(x=na.omit(ukgc2012.data$from_user))) + ukgc2012.xtheme + xlab(NULL)

#plot an ordered bar chart of 'to' tabulated counts
ukgc2012.counts$Name <- reorder(ukgc2012.counts$Name, ukgc2012.counts$toCount)
ggplot(ukgc2012.counts) + geom_bar(stat = "identity",aes(x=Name,y=toCount)) + ukgc2012.xtheme + xlab(NULL)

#plot a scatterplot displaying from and to counts on the x and y axes, with label size showing the "RT of" count
twViz.scatter2(ukgc2012.counts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.