Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Analysis of the different ways to write the name Mohammad. More info: http://turki.ws/?p=14
require(tm)
require(wordcloud )
require(descr)
# Collect every token in data.txt that looks like a spelling of "Mohammad".
#
# Input : data.txt, one full name per line (commas are ignored).
# Output: List, a character vector of the matching tokens, in file order.
fileName <- "data.txt"

# readLines() on a path opens and closes its own connection, so no connection
# is left open if a later step fails.
full_names <- readLines(fileName)

# A token matches when it contains a stretch of non-space characters that
# starts with M and ends with D on word boundaries, with an H in between.
# Matching is case-insensitive; perl = TRUE is needed for \S inside [...].
variation_pattern <- "\\b[M][\\S]*[H][\\S]*[D]\\b"

# Strip commas, then split every full name on single spaces. Working on the
# whole vector at once means blank lines (and an empty file) simply contribute
# no tokens instead of breaking a 1:length(...) loop.
name_tokens <- unlist(
  strsplit(gsub(",", "", full_names, fixed = TRUE), " ", fixed = TRUE),
  use.names = FALSE
)

# Keep only the tokens that are variations of M%h%d.
is_variation <- grepl(
  variation_pattern,
  name_tokens,
  ignore.case = TRUE,
  perl = TRUE
)
List <- name_tokens[is_variation]
# Show every matched spelling, in input order.
print(List)

# Frequency table of all matched spellings (descr::freq). The explicit print()
# makes the table appear even when the script is run through source(), which
# does not auto-print top-level values.
print(freq(List, plot = FALSE))

# Word cloud for a fixed set of spellings. The counts are hard-coded
# (presumably copied from the frequency table of the original data set --
# confirm before reusing with a different data.txt).
spellings <- c(
  "mohammed", "mohammad", "muhammad", "mohamed", "mohamad", "muhammed"
)
# Named spelling_counts rather than freq so it does not mask descr::freq.
spelling_counts <- c(638, 573, 83, 71, 17, 5)
wordcloud(
  spellings,
  spelling_counts,
  max.words = 6,
  random.order = TRUE,
  rot.per = 0.0,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Set2")
)

# Word cloud for the whole List, just for fun :)
# The tokens are wrapped in a tm corpus and passed to wordcloud().
name_corpus <- VCorpus(VectorSource(List))
wordcloud(
  name_corpus,
  scale = c(100, 1),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.0,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Set2")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment