Skip to content

Instantly share code, notes, and snippets.

@psychemedia
Created December 22, 2011 18:01
Show Gist options
  • Save psychemedia/1511220 to your computer and use it in GitHub Desktop.
Tools for processing and visualising data from a TAGS archive
require(stringr)
require(RCurl)
require(ggplot2)
#' Run a query against a Google Spreadsheet and read the CSV response
#'
#' Assembles the request URL from the spreadsheet key, the URL-escaped query
#' and the sheet id, then hands that URL to `read.csv()`.
#'
#' @param key Spreadsheet key placed in the `key` URL parameter.
#' @param query Query text; escaped with `curlEscape()` before being placed in
#'   the `tq` URL parameter.
#' @param gid Sheet id placed in the `gid` URL parameter (defaults to 0).
#' @return The data frame returned by `read.csv()` for the assembled URL.
gsqAPI <- function(key, query, gid = 0) {
  endpoint <- 'http://spreadsheets.google.com/tq?'
  request <- paste0(
    endpoint,
    'tqx=out:csv',
    '&tq=', curlEscape(query),
    '&key=', key,
    '&gid=', gid
  )
  read.csv(request)
}
#' Strip the first "@" from each string
#'
#' Only the first occurrence in each element is removed; later "@" characters
#' are left in place and `NA` elements stay `NA`.
#'
#' @param x Character vector (for example "@user" handles).
#' @return Character vector the same length as `x`.
trim <- function(x) {
  sub(pattern = '@', replacement = '', x = x)
}
#' Tabulate per-user activity counts from a parsed archive
#'
#' Counts how often each name occurs in the `to`, `from_user`, `rtof` and
#' `rtby` columns of `df` and merges the four tallies on `Name`.
#'
#' The merges use `merge()`'s defaults, so only names present in all four
#' tallies appear in the result.
#'
#' @param df Data frame with columns `to`, `from_user`, `rtof` and `rtby`.
#' @return Data frame with columns `Name` (factor), `rtofCount`, `toCount`,
#'   `rtbyCount` and `fromCount`.
twCounts <- function(df) {
  # Tabulate one column into a two-column (Name, <count_label>) data frame.
  tabulate_names <- function(values, count_label) {
    tally <- data.frame(table(values))
    colnames(tally) <- c('Name', count_label)
    tally
  }

  print("Counting @'d users")
  to.count <- tabulate_names(df$to, 'toCount')

  print('Counting senders')
  from.count <- tabulate_names(df$from_user, 'fromCount')

  print('Counting rtof users')
  rtof.count <- tabulate_names(df$rtof, 'rtofCount')

  print('Counting rtby users')
  rtby.count <- tabulate_names(df$rtby, 'rtbyCount')

  print('Merging datasets')
  # Left-to-right fold: ((rtof + to) + rtby) + from, each joined on Name.
  merged <- Reduce(merge, list(rtof.count, to.count, rtby.count, from.count))
  # Re-factor so the levels only contain names that survived the merges.
  merged$Name <- factor(merged$Name)
  merged
}
#' Scatter plot of user names positioned by sent and received counts
#'
#' Rows containing `NA` are dropped, then each `Name` is drawn as a text label
#' at (`fromCount`, `toCount`), sized by `rtofCount` and rotated by 45 degrees.
#'
#' @param df Data frame with columns `Name`, `fromCount`, `toCount` and
#'   `rtofCount`.
#' @return A ggplot object.
twViz.scatter <- function(df) {
  ggplot(na.omit(df)) +
    geom_text(
      # Size by rtofCount: the counts table has rtofCount/rtbyCount columns
      # but no rtCount column.
      aes(x = fromCount, y = toCount, label = Name, size = rtofCount),
      # Rotation is a fixed setting, not a data mapping, so it sits outside
      # aes().
      angle = 45
    )
}
#' Fetch a TAGS archive sheet and derive addressee / retweet columns
#'
#' Downloads every column of the sheet via `gsqAPI()` and adds three columns
#' derived from `text` and `from_user`:
#'
#' * `to`   - handle (without "@") that the tweet starts with, else `NA`;
#' * `rtof` - handle (without "@") following a leading "RT ", else `NA`;
#' * `rtby` - `from_user` for rows where `rtof` is not `NA`, else `NA`.
#'
#' @param key Spreadsheet key, passed to `gsqAPI()`.
#' @param gid Sheet id, passed to `gsqAPI()`.
#' @return The downloaded data frame with `to`, `rtof` and `rtby` added.
twArchParse <- function(key, gid) {
  print('Getting data')
  df <- gsqAPI(key, 'select *', gid)
  print('Got data')

  # Coerce once so the string functions see plain character data even when
  # read.csv() produced a factor column.
  tweets <- as.character(df$text)

  print('Parsing @ messages')
  # str_extract() is vectorised over its input, so no per-row loop is needed.
  df$to <- trim(str_extract(tweets, "^(@[[:alnum:]_]*)"))

  print('Parsing RT: messages')
  # Column 2 of the str_match() matrix is the first capture group (the handle).
  df$rtof <- trim(str_match(tweets, "^RT (@[[:alnum:]_]*)")[, 2])

  print('Parsing RT: senders')
  # Test for missing values directly instead of comparing against the
  # string 'NA', which would misread a retweet of a user called "NA".
  df$rtby <- ifelse(is.na(df$rtof), NA_character_, as.character(df$from_user))

  df
}
#' Convert a vector to a factor whose levels are ordered by frequency
#'
#' Levels run from the least to the most frequent value, which makes a bar
#' chart of the result come out sorted by bar height.
#'
#' @param dfc Vector of values to tabulate.
#' @return Factor with the same values as `dfc` and levels in ascending order
#'   of how often each value occurs.
barsorter <- function(dfc) {
  freq <- table(dfc)
  ascending <- order(freq)
  factor(dfc, levels = names(freq)[ascending])
}
#' Scatter plot of user names with configurable axes and label size
#'
#' Rows containing `NA` are dropped, then each `Name` is drawn as a text label
#' rotated by 45 degrees, positioned and sized by the columns named in the
#' arguments.
#'
#' @param df Data frame with a `Name` column plus the columns named below.
#' @param xax Name of the column mapped to the x axis.
#' @param yax Name of the column mapped to the y axis.
#' @param zsz Name of the column mapped to label size.
#' @return A ggplot object.
twViz.scatter2 <- function(df, xax = 'fromCount', yax = 'toCount', zsz = 'rtofCount') {
  ggplot(na.omit(df)) +
    geom_text(
      aes_string(x = xax, y = yax, label = 'Name', size = zsz),
      # Rotation is a fixed setting, not a data mapping, so it sits outside
      # the aesthetic mapping (and is spelled "angle", not "angl").
      angle = 45
    )
}
# Example usage ----
# Spreadsheet key and sheet id of the archive to analyse.
key <- '0AmbQbL4Lrd61dENiT1E4SFBTbWhzVExzTElwU0NTY3c'
gid <- 82
# ukgc2012 <- gsqAPI(key, 'select *', gid)
ukgc2012.data <- twArchParse(key, gid)
ukgc2012.counts <- twCounts(ukgc2012.data)

# The charts below rotate and shrink the x-axis labels with
# theme()/element_text(), which replace the defunct opts()/theme_text().

# Plot a bar chart of "RT of" counts.
ggplot() +
  geom_bar(aes(x = na.omit(ukgc2012.data$rtof))) +
  theme(axis.text.x = element_text(angle = -90, size = 6)) +
  xlab(NULL)

# Sorted plot based on computed counts - "RT of".
ukgc2012.data$hrt <- barsorter(ukgc2012.data$rtof)
ggplot() +
  geom_bar(aes(x = na.omit(ukgc2012.data$hrt))) +
  theme(axis.text.x = element_text(angle = -90, size = 6)) +
  xlab(NULL)

# Plot a bar chart of 'to' computed counts.
ggplot() +
  geom_bar(aes(x = na.omit(ukgc2012.data$to))) +
  theme(axis.text.x = element_text(angle = -90, size = 6)) +
  xlab(NULL)

# Plot a bar chart of 'from' computed counts.
ggplot() +
  geom_bar(aes(x = na.omit(ukgc2012.data$from_user))) +
  theme(axis.text.x = element_text(angle = -90, size = 6)) +
  xlab(NULL)

# Plot an ordered bar chart of 'to' tabulated counts.
ukgc2012.counts$Name <- reorder(ukgc2012.counts$Name, ukgc2012.counts$toCount)
ggplot(ukgc2012.counts) +
  geom_bar(stat = "identity", aes(x = Name, y = toCount)) +
  theme(axis.text.x = element_text(angle = -90, size = 6)) +
  xlab(NULL)

# Plot a scatterplot with 'from' and 'to' counts on the x and y axes and the
# label size showing the "RT of" count.
twViz.scatter2(ukgc2012.counts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment