Skip to content

Instantly share code, notes, and snippets.

@jalapic
Created April 6, 2015 20:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jalapic/2b9d7e4b9e8b7119d965 to your computer and use it in GitHub Desktop.
Save jalapic/2b9d7e4b9e8b7119d965 to your computer and use it in GitHub Desktop.
babynames MDS
### load packages
library(babynames)
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(magrittr)
head(babynames)
tail(babynames)
g1 <- babynames %>%
filter(name=="Barbara") %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=sex), lwd=1) +
scale_color_manual(values = c("firebrick1", "dodgerblue")) +
theme_bw() +
ggtitle("Barbara")
g2 <- babynames %>%
filter(name=="Megan") %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=sex), lwd=1) +
scale_color_manual(values = c("firebrick1", "dodgerblue")) +
theme_bw() +
ggtitle("Megan")
g3 <- babynames %>%
filter(name=="Jennifer") %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=sex), lwd=1) +
scale_color_manual(values = c("firebrick1", "dodgerblue")) +
theme_bw() +
ggtitle("Jennifer")
g4 <- babynames %>%
filter(name=="Irene") %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=sex), lwd=1) +
scale_color_manual(values = c("firebrick1", "dodgerblue")) +
theme_bw() +
ggtitle("Irene")
grid.arrange(g1,g2,g3,g4,ncol=2)
#popular names
babynames %>%
group_by(sex, name) %>%
summarize(total = sum(n)) %>%
arrange(desc(total)) %$%
split(., sex)
#total number unique names
babynames %$%
split(., sex) %>%
lapply(. %$% length(unique(name)))
#reshape data
babywideF <-
babynames %>%
filter(sex=="F") %>%
select(name, year, n) %>%
spread(year, n, fill=0)
rownames(babywideF)<- babywideF %>% .$name #set rownames
babywideF %<>% select(-name) # remove name var
#principal components analysis
### principal components analysis - females
resF.pca <- princomp(babywideF)
plot(resF.pca)
###k-means clustering analysis
set.seed(100)
resF.k <- kmeans(babywideF, 6)
table(resF.k$cluster)
names(resF.k$cluster[resF.k$cluster==2])
names(resF.k$cluster[resF.k$cluster==3])
names(resF.k$cluster[resF.k$cluster==4])
names(resF.k$cluster[resF.k$cluster==5])
names(resF.k$cluster[resF.k$cluster==6])
#cluster 1
set.seed(10)
sample(names(resF.k$cluster[resF.k$cluster==1]),10)
# Repeat the process
# It might be more beneficial to repeat this process, but only include those names
# in the top 6 components. To do this we will filter our data by not keeping any names
# that appear in cluster 5. This leaves us with 290 names.
group1x <- names(resF.k$cluster[resF.k$cluster==1])
babywideF1 <- babywideF %>% mutate(id = rownames(.)) %>% filter(!id %in% group1x)
rownames(babywideF1) <- babywideF1$id #using this temp var to re-insert names into rownames (probably not the best way of doing this)
babywideF1 %<>% select(-id)
### principal components analysis - females
resF1.pca <- princomp(babywideF1)
plot(resF1.pca)
###k-means clustering analysis
set.seed(10)
resF1.k <- kmeans(babywideF1, 7)
table(resF1.k$cluster)
names(resF1.k$cluster[resF1.k$cluster==4])
#Mary
babynames %>%
filter(sex=="F") %>%
filter(name=="Mary") %$%
ggplot(., aes(year, n)) +
geom_line(lwd=1, color="red") +
theme_bw()
#group 3
group3 <- names(resF1.k$cluster[resF1.k$cluster==3])
gg3 <-
babynames %>%
filter(sex=="F") %>%
filter(name %in% group3) %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=name, group=name), lwd=1) +
theme_bw()
gg3
#group 6
group6 <- names(resF1.k$cluster[resF1.k$cluster==6])
gg6 <-
babynames %>%
filter(sex=="F") %>%
filter(name %in% group6) %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=name, group=name), lwd=1) +
theme_bw()
gg6
# Elizabeth
babynames %>%
filter(sex=="F") %>%
filter(name=="Elizabeth") %$%
ggplot(., aes(year, n)) +
geom_line(lwd=1, color="red") +
theme_bw()
#group1
group1 <- names(resF1.k$cluster[resF1.k$cluster==1])
gg1 <-
babynames %>%
filter(sex=="F") %>%
filter(name %in% group1) %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=name, group=name), lwd=1) +
theme_bw()
gg1
#group2
group2 <- names(resF1.k$cluster[resF1.k$cluster==2])
gg2 <-
babynames %>%
filter(sex=="F") %>%
filter(name %in% group2) %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=name, group=name), lwd=1) +
theme_bw()
gg2
# Group 5 is the second biggest group and it has 66 names.
group5 <- names(resF1.k$cluster[resF1.k$cluster==5])
group5 #these are these names
gg5 <-
babynames %>%
filter(sex=="F") %>%
filter(name %in% group5) %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=name, group=name), lwd=.5) +
theme_bw() +
theme(legend.position = "none")
gg5
#double boomers
babynames %>%
filter(sex=="F") %>%
filter(name=="Grace" | name=="Julia" | name=="Ella") %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=name), lwd=1) +
theme_bw()
#group7
group7 <- names(resF1.k$cluster[resF1.k$cluster==7])
group7 # these are these names
gg7 <-
babynames %>%
filter(sex=="F") %>%
filter(name %in% group7) %$%
ggplot(., aes(year, n)) +
geom_line(aes(color=name, group=name), lwd=.5) +
theme_bw() +
theme(legend.position = "none")
gg7
## overview
grid.arrange(gg2, gg1, gg3, gg6, ncol=2)
#### T-distributed Stochastic Neighbor Embedding.
library(tsne)
D <- dist(babywideF1) #create distance object
# creating dataframe for plotting colors and text on final plot
namesdf <- data.frame(name = c(group1, group2, group3, "Mary", group5, group6, group7),
group = c(rep(1, length(group1)), rep(2, length(group2)), rep(3, length(group3)), rep(4, 1),
rep(5, length(group5)), rep(6, length(group6)), rep(7, length(group7)))
)
namesdf %<>% arrange(name) #names in correct order to match rownames of babywideF1
colors = rainbow(7)
names(colors) = unique(namesdf$group)
#define function used in plotting
ecb = function(x,y){ plot(x,t='n'); text(x,labels=rownames(babywideF1), col=colors[namesdf$group], cex=1) }
#plot
tsne_D = tsne(D, k=2, epoch_callback = ecb, perplexity=50)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment