Skip to content

Instantly share code, notes, and snippets.

@jlehtoma
Created December 23, 2011 08:47
Show Gist options
  • Save jlehtoma/1513613 to your computer and use it in GitHub Desktop.
Save jlehtoma/1513613 to your computer and use it in GitHub Desktop.
Fetch a Twitter archive from Twapperkeeper, then preprocess and visualize its content
require(stringr)
# Helper for user names: removes the first "@" found in each element of x
# (elements without an "@" are returned unchanged, NA stays NA).
trim <- function(x) {
  sub(pattern = "@", replacement = "", x = x, fixed = TRUE)
}
#' Parse a Twapperkeeper archive export into a data frame of tweets.
#'
#' The file is read with `read.csv()` (no header) and only the first column,
#' `V1`, is parsed. Each record is presumably of the form
#' "user: text - <timestamp> - tweet id <digits>" -- TODO confirm against a
#' real export.
#'
#' NOTE(review): `read.csv()` splits on commas and only `V1` is used, so a
#' tweet containing an unquoted comma would be cut short -- verify that the
#' export quotes its fields.
#'
#' @param fp File path (or connection) handed to `read.csv()`.
#' @return The data frame returned by `read.csv()` with these character
#'   columns added:
#'   * `from`: leading run of alphanumerics/underscores in the record.
#'   * `id`: trailing run of digits in the record ("" if there is none).
#'   * `txt`: record with the "- tweet id ..." suffix, the last 34 remaining
#'     characters, and the leading "user:" prefix removed, then trimmed.
#'   * `to`: handle at the very start of `txt` (without "@"), else `NA`.
#'   * `rt`: handle following a leading "RT " in `txt` (without "@"),
#'     else `NA`.
twapperkeeperCSVParse <- function(fp) {
  df <- read.csv(fp, header = FALSE, stringsAsFactors = FALSE)
  raw <- as.character(df$V1)

  # Trailing tweet id, tolerating whitespace after it. The original pattern
  # used "[[:digit:]/s]", which matches the literal characters "/" and "s"
  # rather than whitespace, and failed on records with trailing blanks.
  id_suffix <- "[[:digit:]]*[[:space:]]*$"

  df$from <- str_extract(raw, "^([[:alnum:]_]*)")
  df$id <- str_trim(str_extract(raw, id_suffix))

  # Strip the id suffix first so the fixed-width cut below is measured from
  # the end of what remains.
  body <- str_replace(raw, paste0("- tweet id ", id_suffix), "")
  # Drop the last 34 characters (presumably the timestamp field -- TODO
  # confirm its width), then the leading "user:" prefix.
  body <- str_sub(body, end = -35)
  body <- str_replace(body, "^([[:alnum:]_]*:)", "")
  df$txt <- str_trim(body)

  df$to <- trim(str_extract(df$txt, "^(@[[:alnum:]_]*)"))
  # Column 2 of the match matrix is the captured "@handle" group.
  df$rt <- trim(str_match(df$txt, "^RT (@[[:alnum:]_]*)")[, 2])

  df
}
# Usage:
#   twarchive.df <- twapperkeeperCSVParse("PATH_TO_YOUR_FILE")
# For example:
df <- twapperkeeperCSVParse("reports/twArchive_ICCB.txt")

# Edge lists: sender -> addressed user, and sender -> retweeted user.
# Rows where the tweet is not a reply / not a retweet carry NA in the
# second column.
ats.df <- data.frame(df$from, df$to)
rts.df <- data.frame(df$from, df$rt)

# Cribbing http://blog.ynada.com/339
library(igraph)

# Only complete rows are real edges. Passing NA endpoints to
# graph.data.frame() would turn them into a spurious vertex named "NA"
# that every non-reply / non-retweet sender points at.
ats.g <- graph.data.frame(ats.df[complete.cases(ats.df), ], directed = TRUE)
rts.g <- graph.data.frame(rts.df[complete.cases(rts.df), ], directed = TRUE)

# Export both networks as GraphML files in the working directory.
write.graph(ats.g, file = "ats.graphml", format = "graphml")
write.graph(rts.g, file = "rts.graphml", format = "graphml")
library(ggplot2)

# Turn rt into an ordered factor whose levels run from least to most
# retweeted user (negating the counts and sorting decreasingly yields
# ascending counts), so the bars come out sorted by count.
rtOrdered <- transform(
  df,
  rt = ordered(rt, levels = names(sort(-table(rt), decreasing = TRUE)))
)

# Plot retweet counts per user, leaving out tweets that are not retweets.
# theme()/element_text() replace opts()/theme_text(), which are defunct in
# current ggplot2.
ggplot(rtOrdered[!is.na(rtOrdered$rt), ], aes(x = rt)) +
  geom_bar() +
  theme(axis.text.x = element_text(size = 8)) +
  coord_flip() +
  xlab("User") +
  ylab("Retweets count")
# Select original tweets (rows with no retweeted user)
df.original <- df[is.na(df$rt), ]

# Turn from into an ordered factor whose levels run from least to most
# active user (negating the counts and sorting decreasingly yields
# ascending counts).
tweetsOrdered <- transform(
  df.original,
  from = ordered(from, levels = names(sort(-table(from), decreasing = TRUE)))
)

# Tweets per user. The column is spelled out in full: the original `$f`
# only reached `from` through partial matching of `$`.
tweet.count <- data.frame(table(df.original$from))

# Keep only users with more than one original tweet and drop the factor
# levels of the users that were filtered out.
filter <- subset(tweet.count, Freq > 1)
filter.tweetsOrdered <- droplevels(
  subset(tweetsOrdered, from %in% filter$Var1)
)

# Plot tweet counts per user.
# theme()/element_text() replace opts()/theme_text(), which are defunct in
# current ggplot2.
ggplot(
  filter.tweetsOrdered[!is.na(filter.tweetsOrdered$from), ],
  aes(x = from)
) +
  geom_bar() +
  theme(axis.text.x = element_text(size = 8)) +
  coord_flip() +
  xlab("User") +
  ylab("Tweets count")
# Tally how many times each user name appears in the rt column
# (columns of the result: Var1 = user, Freq = count)
rt.count <- data.frame(table(df$rt))
# Rank the rows by descending count and show the five highest
head(rt.count[order(-rt.count[["Freq"]]), , drop = FALSE], n = 5)
#There are probably better ways of doing that! If so, let me know via comments
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment