Skip to content

Instantly share code, notes, and snippets.

@drewconway
Created March 12, 2010 16:26
Show Gist options
  • Save drewconway/330479 to your computer and use it in GitHub Desktop.
Save drewconway/330479 to your computer and use it in GitHub Desktop.
library(ggplot2)
library(XML)
### Meetup topics word cloud ###
# Read the raw meetup descriptions, one per line of 'descriptions.txt'.
#
# levels() only works on a factor, so stringsAsFactors = TRUE is spelled out:
# since R 4.0.0 read.table() returns character columns by default, which made
# levels() return NULL here and left the rest of the script with no input.
# quote = "" and comment.char = "" make read.table() take each line literally;
# with the defaults, an apostrophe or '#' inside a description would be parsed
# as a quoted field or a comment and corrupt the text.
#
# NOTE(review): taking levels() means duplicate descriptions are collapsed and
# the result is sorted -- kept as in the original; confirm this is intended.
raw_desc <- levels(
  read.table(
    "descriptions.txt",
    sep = "\n",
    quote = "",
    comment.char = "",
    stringsAsFactors = TRUE
  )$V1
)
# Normalise text and break it into words.
#
# s: character vector of raw text.
# Returns a list with one character vector per element of `s`, holding the
# lower-cased text with punctuation and newlines removed, split on single
# spaces. Consecutive spaces therefore produce empty strings in the result.
clean_strings <- function(s) {
  stripped <- gsub("[[:punct:]\n]", "", tolower(s))
  strsplit(stripped, " ")
}
# Split every description into words and tally how often each word occurs.
word_vector <- unlist(lapply(raw_desc, clean_strings))
# Splitting on single spaces leaves empty strings wherever spaces were
# adjacent; leave those out of the tally.
words <- as.data.frame(table(word_vector[word_vector != ""]))
names(words) <- c("WORD", "COUNT")
# Fetch the list of most common English words from Wikipedia and drop those
# words from the tally.
# NOTE(review): melt() is not provided by the library() calls visible in this
# file -- presumably it used to be attached alongside ggplot2; confirm it is
# available before running.
com_words <- melt(
  readHTMLTable("http://en.wikipedia.org/wiki/Most_common_words_in_English")
)
com_words <- tolower(as.vector(com_words$Word))
# 'is' and 'are' are not in the scraped list, so they are added by hand
com_words <- c(com_words, "is", "are")
# NA in word_ind marks a word that is absent from the common-word list
word_ind <- match(words$WORD, com_words)
words_clean <- words[is.na(word_ind), ]
# Only words that occur more than once go into the cloud
words_final <- words_clean[words_clean$COUNT > 1, ]
# Draw the word cloud with ggplot2 and save it to 'meetup_cloud.png'.
# Every word is placed at a uniformly random position and sized by its count.
x <- runif(nrow(words_final), -1.5, 1.5)
y <- runif(nrow(words_final), -1, 1)
words_final <- transform(words_final, X = x, Y = y)
# Build the plot before opening the graphics device, so that an error while
# constructing it cannot leave a png device open.
# theme() replaces opts(), which was deprecated and later removed from ggplot2.
# The blank axis titles and blank break labels hide the meaningless
# coordinates.
cloud <- ggplot(words_final, aes(X, Y)) +
  geom_text(aes(label = WORD, size = COUNT)) +
  theme(legend.position = "none") +
  xlab("") +
  ylab("") +
  scale_x_continuous(breaks = c(-2, 0, 2), labels = c("", "", "")) +
  scale_y_continuous(breaks = c(-1.5, 0, 1.5), labels = c("", "", ""))
png("meetup_cloud.png", res = 400, height = 800, width = 1000)
# Explicit print(): a bare ggplot object is not drawn when the script is run
# through source(), which would leave the PNG empty.
print(cloud)
dev.off()
# The positions are random, so getting a layout with little overlap takes some
# trial and error (re-running the script); there is probably a better way to
# choose locations that avoids overlap.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment