Created
March 12, 2010 16:26
-
-
Save drewconway/330479 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
library(XML) | |
### Meetup topics word cloud ###
# Get the raw meetup descriptions, one per line of descriptions.txt.
# levels() only works on a factor column. Since R 4.0.0 read.table() keeps
# strings as character by default, which would make levels() return NULL and
# leave the cloud empty, so the factor conversion is requested explicitly.
# The result is the sorted set of distinct description lines.
raw_desc <- levels(
  read.table("descriptions.txt", sep = "\n", stringsAsFactors = TRUE)$V1
)
# Lower-case each string, delete punctuation and newline characters, then
# split what is left on single spaces. Returns a list holding one character
# vector of tokens per input string; consecutive spaces yield "" tokens.
clean_strings <- function(s) {
  stripped <- gsub("[[:punct:]\n]", "", tolower(s))
  strsplit(stripped, " ")
}
# Flatten the tokens of every description into a single vector, discard the
# empty tokens, and tabulate how often each remaining word occurs.
word_vector <- unlist(clean_strings(raw_desc))
words <- as.data.frame(table(word_vector[word_vector != ""]))
names(words) <- c("WORD", "COUNT")
# Retrieve 100 most common English words from Wikipedia,
# and remove them from data frame.
# Wikipedia is served over HTTPS, which the XML package cannot fetch itself,
# so the page is downloaded with base R and parsed from text. melt() is not
# provided by ggplot2 or XML, so the "Word" column is instead pulled out of
# every parsed table that has one.
com_url <- "https://en.wikipedia.org/wiki/Most_common_words_in_English"
com_page <- htmlParse(
  readLines(com_url, warn = FALSE, encoding = "UTF-8"),
  asText = TRUE,
  encoding = "UTF-8"
)
com_tables <- readHTMLTable(com_page, stringsAsFactors = FALSE)
com_words <- unlist(
  lapply(com_tables, function(tbl) tbl[["Word"]]),
  use.names = FALSE
)
com_words <- tolower(as.vector(com_words))
# Somehow, 'is' and 'are' are not among the 100 most common words
com_words <- append(com_words, c("is", "are"))
# Keep only the words that have no match in the common-word list, then
# restrict the cloud to words appearing more than once.
word_ind <- match(words$WORD, com_words)
words_clean <- words[is.na(word_ind), ]
words_final <- words_clean[words_clean$COUNT > 1, ]
# Now, let's make a word cloud in ggplot2.
# Every word gets a uniformly random position; text size follows COUNT.
x <- runif(nrow(words_final), -1.5, 1.5)
y <- runif(nrow(words_final), -1, 1)
words_final <- transform(words_final, X = x, Y = y)
png("meetup_cloud.png", res = 400, height = 800, width = 1000)
# opts() has been removed from ggplot2 in favour of theme(). An unprinted
# ggplot object also draws nothing when the script is run via source(), so
# the plot is built with theme() and printed explicitly.
cloud <- ggplot(words_final, aes(X, Y)) +
  geom_text(aes(label = WORD, size = COUNT)) +
  theme(legend.position = "none") +
  xlab("") +
  ylab("") +
  scale_x_continuous(breaks = c(-2, 0, 2), labels = c("", "", "")) +
  scale_y_continuous(breaks = c(-1.5, 0, 1.5), labels = c("", "", ""))
print(cloud)
dev.off()
# This requires some trial and error to minimize overlap; there is probably
# a better way to specify locations so that words do not overlap.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment