jalapic/gist:2b9d7e4b9e8b7119d965

## gistfile1.txt
### load packages
library(babynames)
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(magrittr)


head(babynames)

tail(babynames)

g1 <- babynames %>%
  filter(name=="Barbara") %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=sex), lwd=1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw() +
  ggtitle("Barbara")

g2 <- babynames %>%
  filter(name=="Megan") %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=sex), lwd=1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw() +
  ggtitle("Megan")

g3 <- babynames %>%
  filter(name=="Jennifer") %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=sex), lwd=1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw() +
  ggtitle("Jennifer")

g4 <- babynames %>%
  filter(name=="Irene") %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=sex), lwd=1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw() +
  ggtitle("Irene")

grid.arrange(g1,g2,g3,g4,ncol=2)


#popular names
babynames %>%
  group_by(sex, name) %>%
  summarize(total = sum(n)) %>%
  arrange(desc(total)) %$%
  split(., sex)

#total number unique names
babynames %$%
  split(., sex) %>%
  lapply(. %$% length(unique(name)))


#reshape data
babywideF <-
  babynames %>%
  filter(sex=="F") %>%
  select(name, year, n) %>%
  spread(year, n, fill=0)

rownames(babywideF)<- babywideF %>% .$name  #set rownames
babywideF %<>% select(-name) # remove name var


#principal components analysis
### principal components analysis - females
resF.pca <- princomp(babywideF)
plot(resF.pca)


###k-means clustering analysis
set.seed(100)
resF.k <- kmeans(babywideF, 6)
table(resF.k$cluster)

names(resF.k$cluster[resF.k$cluster==2])
names(resF.k$cluster[resF.k$cluster==3])
names(resF.k$cluster[resF.k$cluster==4])
names(resF.k$cluster[resF.k$cluster==5])
names(resF.k$cluster[resF.k$cluster==6])

#cluster 1
set.seed(10)
sample(names(resF.k$cluster[resF.k$cluster==1]),10)


# Repeat the process
# It might be more beneficial to repeat this process, but only include those names
# in the top 6 components. To do this we will filter our data by not keeping any names
# that appear in cluster 5. This leaves us with 290 names.

group1x <- names(resF.k$cluster[resF.k$cluster==1])
babywideF1 <- babywideF %>% mutate(id = rownames(.)) %>% filter(!id %in% group1x)
rownames(babywideF1) <- babywideF1$id #using this temp var to re-insert names into rownames (probably not the best way of doing this)
babywideF1 %<>% select(-id)

### principal components analysis - females
resF1.pca <- princomp(babywideF1)
plot(resF1.pca)


###k-means clustering analysis
set.seed(10)
resF1.k <- kmeans(babywideF1, 7)
table(resF1.k$cluster)

names(resF1.k$cluster[resF1.k$cluster==4])


#Mary
babynames %>%
  filter(sex=="F") %>%
  filter(name=="Mary") %$%
  ggplot(., aes(year, n)) +
  geom_line(lwd=1, color="red") +
  theme_bw()


#group 3
group3 <- names(resF1.k$cluster[resF1.k$cluster==3])

gg3 <-
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group3) %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=1) +
  theme_bw()

gg3


#group 6
group6 <- names(resF1.k$cluster[resF1.k$cluster==6])

gg6 <-
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group6) %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=1) +
  theme_bw()

gg6


# Elizabeth
babynames %>%
  filter(sex=="F") %>%
  filter(name=="Elizabeth") %$%
  ggplot(., aes(year, n)) +
  geom_line(lwd=1, color="red") +
  theme_bw()

#group1
group1 <- names(resF1.k$cluster[resF1.k$cluster==1])

gg1 <-
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group1) %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=1) +
  theme_bw()

gg1


#group2
group2 <- names(resF1.k$cluster[resF1.k$cluster==2])

gg2 <-
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group2) %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=1) +
  theme_bw()

gg2


# Group 5 is the second biggest group and it has 66 names.

group5 <- names(resF1.k$cluster[resF1.k$cluster==5])

group5 #these are these names


gg5 <-
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group5) %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=.5) +
  theme_bw() +
  theme(legend.position = "none")


gg5


#double boomers
babynames %>%
  filter(sex=="F") %>%
  filter(name=="Grace" | name=="Julia" | name=="Ella") %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name), lwd=1) +
  theme_bw()


#group7
group7 <- names(resF1.k$cluster[resF1.k$cluster==7])
group7 # these are these names


gg7 <-
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group7) %$%
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=.5) +
  theme_bw() +
  theme(legend.position = "none")

gg7


## overview

grid.arrange(gg2, gg1, gg3, gg6, ncol=2)


####  T-distributed Stochastic Neighbor Embedding.

library(tsne)
D <- dist(babywideF1)  #create distance object


# creating dataframe for plotting colors and text on final plot
namesdf <- data.frame(name = c(group1, group2, group3, "Mary", group5, group6, group7),
                      group = c(rep(1, length(group1)), rep(2, length(group2)), rep(3, length(group3)), rep(4, 1),
                                rep(5, length(group5)), rep(6, length(group6)), rep(7, length(group7)))
)

namesdf %<>% arrange(name) #names in correct order to match rownames of babywideF1

colors = rainbow(7)
names(colors) = unique(namesdf$group)

#define function used in plotting
ecb = function(x,y){ plot(x,t='n'); text(x,labels=rownames(babywideF1), col=colors[namesdf$group], cex=1) }

#plot
tsne_D = tsne(D, k=2,  epoch_callback = ecb, perplexity=50)
	### load packages
	library(babynames)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(gridExtra)
	library(magrittr)


	head(babynames)

	tail(babynames)

	g1 <- babynames %>%
	filter(name=="Barbara") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=sex), lwd=1) +
	scale_color_manual(values = c("firebrick1", "dodgerblue")) +
	theme_bw() +
	ggtitle("Barbara")

	g2 <- babynames %>%
	filter(name=="Megan") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=sex), lwd=1) +
	scale_color_manual(values = c("firebrick1", "dodgerblue")) +
	theme_bw() +
	ggtitle("Megan")

	g3 <- babynames %>%
	filter(name=="Jennifer") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=sex), lwd=1) +
	scale_color_manual(values = c("firebrick1", "dodgerblue")) +
	theme_bw() +
	ggtitle("Jennifer")

	g4 <- babynames %>%
	filter(name=="Irene") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=sex), lwd=1) +
	scale_color_manual(values = c("firebrick1", "dodgerblue")) +
	theme_bw() +
	ggtitle("Irene")

	grid.arrange(g1,g2,g3,g4,ncol=2)


	#popular names
	babynames %>%
	group_by(sex, name) %>%
	summarize(total = sum(n)) %>%
	arrange(desc(total)) %$%
	split(., sex)

	#total number unique names
	babynames %$%
	split(., sex) %>%
	lapply(. %$% length(unique(name)))


	#reshape data
	babywideF <-
	babynames %>%
	filter(sex=="F") %>%
	select(name, year, n) %>%
	spread(year, n, fill=0)

	rownames(babywideF)<- babywideF %>% .$name #set rownames
	babywideF %<>% select(-name) # remove name var



	#principal components analysis
	### principal components analysis - females
	resF.pca <- princomp(babywideF)
	plot(resF.pca)



	###k-means clustering analysis
	set.seed(100)
	resF.k <- kmeans(babywideF, 6)
	table(resF.k$cluster)

	names(resF.k$cluster[resF.k$cluster==2])
	names(resF.k$cluster[resF.k$cluster==3])
	names(resF.k$cluster[resF.k$cluster==4])
	names(resF.k$cluster[resF.k$cluster==5])
	names(resF.k$cluster[resF.k$cluster==6])

	#cluster 1
	set.seed(10)
	sample(names(resF.k$cluster[resF.k$cluster==1]),10)



	# Repeat the process
	# It might be more beneficial to repeat this process, but only include those names
	# in the top 6 components. To do this we will filter our data by not keeping any names
	# that appear in cluster 5. This leaves us with 290 names.

	group1x <- names(resF.k$cluster[resF.k$cluster==1])
	babywideF1 <- babywideF %>% mutate(id = rownames(.)) %>% filter(!id %in% group1x)
	rownames(babywideF1) <- babywideF1$id #using this temp var to re-insert names into rownames (probably not the best way of doing this)
	babywideF1 %<>% select(-id)

	### principal components analysis - females
	resF1.pca <- princomp(babywideF1)
	plot(resF1.pca)


	###k-means clustering analysis
	set.seed(10)
	resF1.k <- kmeans(babywideF1, 7)
	table(resF1.k$cluster)

	names(resF1.k$cluster[resF1.k$cluster==4])


	#Mary
	babynames %>%
	filter(sex=="F") %>%
	filter(name=="Mary") %$%
	ggplot(., aes(year, n)) +
	geom_line(lwd=1, color="red") +
	theme_bw()


	#group 3
	group3 <- names(resF1.k$cluster[resF1.k$cluster==3])

	gg3 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group3) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=1) +
	theme_bw()

	gg3


	#group 6
	group6 <- names(resF1.k$cluster[resF1.k$cluster==6])

	gg6 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group6) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=1) +
	theme_bw()

	gg6


	# Elizabeth
	babynames %>%
	filter(sex=="F") %>%
	filter(name=="Elizabeth") %$%
	ggplot(., aes(year, n)) +
	geom_line(lwd=1, color="red") +
	theme_bw()

	#group1
	group1 <- names(resF1.k$cluster[resF1.k$cluster==1])

	gg1 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group1) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=1) +
	theme_bw()

	gg1


	#group2
	group2 <- names(resF1.k$cluster[resF1.k$cluster==2])

	gg2 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group2) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=1) +
	theme_bw()

	gg2



	# Group 5 is the second biggest group and it has 66 names.

	group5 <- names(resF1.k$cluster[resF1.k$cluster==5])

	group5 #these are these names


	gg5 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group5) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=.5) +
	theme_bw() +
	theme(legend.position = "none")


	gg5



	#double boomers
	babynames %>%
	filter(sex=="F") %>%
	filter(name=="Grace" \| name=="Julia" \| name=="Ella") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name), lwd=1) +
	theme_bw()


	#group7
	group7 <- names(resF1.k$cluster[resF1.k$cluster==7])
	group7 # these are these names


	gg7 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group7) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=.5) +
	theme_bw() +
	theme(legend.position = "none")

	gg7


	## overview

	grid.arrange(gg2, gg1, gg3, gg6, ncol=2)


	#### T-distributed Stochastic Neighbor Embedding.

	library(tsne)
	D <- dist(babywideF1) #create distance object


	# creating dataframe for plotting colors and text on final plot
	namesdf <- data.frame(name = c(group1, group2, group3, "Mary", group5, group6, group7),
	group = c(rep(1, length(group1)), rep(2, length(group2)), rep(3, length(group3)), rep(4, 1),
	rep(5, length(group5)), rep(6, length(group6)), rep(7, length(group7)))
	)

	namesdf %<>% arrange(name) #names in correct order to match rownames of babywideF1

	colors = rainbow(7)
	names(colors) = unique(namesdf$group)

	#define function used in plotting
	ecb = function(x,y){ plot(x,t='n'); text(x,labels=rownames(babywideF1), col=colors[namesdf$group], cex=1) }

	#plot
	tsne_D = tsne(D, k=2, epoch_callback = ecb, perplexity=50)