Last active
April 1, 2016 22:04
-
-
Save iturki/e7de628563c71ba60f1de6c4f12420c8 to your computer and use it in GitHub Desktop.
Analysis of the different ways to write the name Mohammad. More info: http://turki.ws/?p=14
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Analysis of the different ways to write the name "Mohammad".
# Reads one full name per line from `fileName`, collects every word that is a
# spelling variation of the name, prints the matches and their frequency
# table, and draws two word clouds.

# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful error.
library(tm)
library(wordcloud)
library(descr)

# Collect, in input order, every space-separated word of `full_names` that
# matches the pattern M...H...D (case-insensitive, delimited by word
# boundaries).
#
# full_names: character vector, one full name per element.
# Returns a character vector of the matching words; length 0 when none match.
extract_name_variants <- function(full_names) {
  # Remove commas from the full names so the collected word is the bare name.
  without_commas <- gsub(",", "", full_names, fixed = TRUE)
  words_in_names <- unlist(strsplit(without_commas, " ", fixed = TRUE))
  if (length(words_in_names) == 0) {
    # Empty input or only blank lines. Index loops over 1:length(x) would run
    # for i = 1, 0 here and fail on a zero-length `if` condition.
    return(character(0))
  }
  is_variation <- grepl(
    "\\b[M][\\S]*[H][\\S]*[D]\\b",
    words_in_names,
    ignore.case = TRUE,
    perl = TRUE  # PCRE reads [\S] as "any non-whitespace character"
  )
  # One vectorised filter instead of growing the result with c() in a loop.
  words_in_names[is_variation]
}

fileName <- "data.txt"
# Given a path, readLines() opens and closes its own connection, so nothing
# stays open if reading fails.
names <- readLines(fileName)
List <- extract_name_variants(names)

# print List
print(List)

# print freq table of whole list (skipped when no word matched)
if (length(List) > 0) {
  freq(List, plot = FALSE)
}

# generate word cloud for specific names
words <- c("mohammed", "mohammad", "muhammad", "mohamed", "mohamad",
           "muhammed")
freq <- c(638, 573, 83, 71, 17, 5)
wordcloud(words, freq, max.words = 6, random.order = TRUE, rot.per = 0.0,
          use.r.layout = FALSE, colors = brewer.pal(8, "Set2"))

# you can generate word cloud for the whole List, just for fun :)
# (skipped when no word matched, since there is nothing to draw)
if (length(List) > 0) {
  lords <- VCorpus(VectorSource(List))
  wordcloud(lords, scale = c(100, 1), max.words = 100, random.order = FALSE,
            rot.per = 0.0, use.r.layout = FALSE,
            colors = brewer.pal(8, "Set2"))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment