# Analysis of the different ways to write the name Mohammad.
# More info: http://turki.ws/?p=14
library(tm)
library(wordcloud)
library(descr)
# Extract every name part that looks like a spelling of "Mohammad" from a
# character vector of full names.
#
# lines:   character vector, one full name per element; commas are removed
#          before the name is split on single spaces.
# pattern: regular expression tested against each name part with
#          ignore.case = TRUE and perl = TRUE. The default accepts a word
#          that starts with M, contains an H and ends with D.
#
# Returns a character vector with the matching name parts in input order
# (zero-length when nothing matches).
#
# Blank lines and empty input are safe here. The earlier index loops used
# 1:length(x), which iterates over c(1, 0) for a zero-length vector and
# stopped with "argument is of length zero".
find_name_variations <- function(lines,
                                 pattern = "\\b[M][\\S]*[H][\\S]*[D]\\b") {
  # remove commas from the full names, then split them into name parts
  parts <- unlist(strsplit(gsub(",", "", lines), " "), use.names = FALSE)
  # unlist() yields NULL for empty input; keep the result a character vector
  parts <- as.character(parts)
  # keep only the parts that are a variation of M%h%d
  parts[grepl(pattern, parts, ignore.case = TRUE, perl = TRUE)]
}

fileName <- "data.txt"
# readLines() given a path opens and closes the file itself, so no connection
# stays open when reading fails
names <- readLines(fileName)
List <- find_name_variations(names)
# print every matched name part
print(List)
# print freq table of whole list; plot = FALSE asks freq() not to draw a plot
freq(List, plot = FALSE)
# generate word cloud for specific names
# NOTE(review): the counts below are hard-coded, presumably copied from the
# frequency table of one particular data.txt -- confirm before reusing them
words <- c("mohammed", "mohammad", "muhammad", "mohamed", "mohamad", "muhammed")
# this numeric vector shares its name with the freq() function; calls to
# freq() still work because R skips non-function objects when resolving a call
freq <- c(638, 573, 83, 71, 17, 5)
wordcloud(words, freq, max.words = 6, random.order = TRUE, rot.per = 0.0,
          use.r.layout = FALSE, colors = brewer.pal(8, "Set2"))
# you can generate word cloud for the whole List, just for fun :)
# the matched name parts are wrapped in a tm corpus and passed to wordcloud()
lords <- VCorpus(VectorSource(List))
wordcloud(lords, scale = c(100, 1), max.words = 100, random.order = FALSE,
          rot.per = 0.0, use.r.layout = FALSE, colors = brewer.pal(8, "Set2"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment