Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

Graphical Analysis of Trends in Name Data

View gist:0b54b14d3040d3e89709
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
# Package dependencies. library() stops with an error when a package is
# missing; require() would only warn and return FALSE, letting the script run
# on until the first call into the missing package fails.
library(plyr)     # ddply(), summarize(), .()
library(ggplot2)  # all plotting
library(scales)   # comma label formatter
 
# Download data from:
# http://www.ssa.gov/oact/babynames/names.zip
# and extract it into the directory used below.
setwd("C:/Data/SS-names/")

# Keep only the yearly data files. The pattern is anchored and escapes the
# dot, so names that merely contain "txt" (or "x.txt.bak") are excluded, and
# it requires a four-digit year at characters 4-7, which is where the reading
# step takes the year from.
files <- list.files(pattern = "^.{3}[0-9]{4}\\.txt$")
 
###### Reading files
# Each input file has no header row and three columns: name, gender, count.
# The year is not stored inside the file; it is taken from characters 4-7 of
# the file name.
if (length(files) == 0) {
  stop("No yearly name files found in ", getwd(), call. = FALSE)
}

# Read every file into its own data frame and bind them in a single call;
# growing the table with rbind() inside a loop recopies it on every iteration.
yearly_tables <- lapply(files, function(path) {
  one_year <- read.csv(
    path,
    header = FALSE,
    col.names = c("name", "gender", "count"),
    # Explicit types: keeps gender "F" from ever being parsed as a logical
    # and keeps names as plain strings on every R version.
    colClasses = c("character", "character", "integer")
  )
  # Kept as character on purpose: the trend plots further down put year on a
  # discrete x scale.
  one_year$year <- substr(path, 4, 7)
  one_year
})

namedata <- do.call(rbind, yearly_tables)
 
# Size of the combined table (rows, columns).
dim(namedata)
# 1.8 million rows

# One table per gender code.
Mdata <- namedata[namedata[["gender"]] == "M", ]
Fdata <- namedata[namedata[["gender"]] == "F", ]

# Total count per name across all years, separately for each gender.
Msums <- ddply(Mdata, "name", summarize, sum = sum(count))
Fsums <- ddply(Fdata, "name", summarize, sum = sum(count))

# Number of distinct names per gender.
nrow(Msums)
nrow(Fsums)
# There are 38601 male names and 64089 female names

# Names sorted from the largest all-time total to the smallest.
Morder <- Msums[order(Msums$sum, decreasing = TRUE), ]
Forder <- Fsums[order(Fsums$sum, decreasing = TRUE), ]
 
# Dot plot of the first `n` rows of `ranked`, a data frame with columns
# `name` and `sum` that is already sorted by descending `sum`. Names go on the
# vertical axis (via coord_flip) and both the position and the size of each
# point encode the total.
#
# ranked: data frame with columns `name` and `sum`, sorted by descending sum.
# title:  plot title.
# n:      number of leading rows to show (default 20).
# Returns the ggplot object; calling it at top level draws the figure.
#
# The plot is built inside a function instead of being stored in a variable
# called `c`, which would mask base::c() for the rest of the session.
plot_top_names <- function(ranked, title, n = 20) {
  # head() never pads with NA rows when fewer than n names are available.
  leaders <- head(ranked, n)
  ggplot(leaders, aes(x = name, y = sum, size = sum)) +
    geom_point() +
    coord_flip() +
    theme(legend.position = "none") +
    ggtitle(title) +
    xlab("") +
    scale_y_continuous(
      name = "Names Recorded With Social Security Administration",
      labels = comma
    )
}

plot_top_names(Morder, "20 Most Popular Male Names Since 1880")
# Figure 1
plot_top_names(Forder, "20 Most Popular Female Names Since 1880")
# Figure 2
 
# Per-year statistics for every row, computed separately for each gender:
#   torder - rank of the name by count within its year (1 = highest count)
#   prop   - the name's count divided by that year's total count
#
# rank() is used instead of order(): order() returns the permutation that
# would sort the counts, which equals the rank only when the rows of a year
# are already sorted by descending count. ties.method = "first" gives the same
# numbers as before on such pre-sorted input and stays correct otherwise.
#
# Grouping with ave() covers every year present in the data, so no year range
# is hard-coded and newly added years are not left as NA.
Mdata$torder <- ave(
  Mdata$count, Mdata$year,
  FUN = function(counts) rank(-counts, ties.method = "first")
)
# as.numeric() keeps the yearly totals in double precision.
Mdata$prop <- Mdata$count / ave(as.numeric(Mdata$count), Mdata$year, FUN = sum)

Fdata$torder <- ave(
  Fdata$count, Fdata$year,
  FUN = function(counts) rank(-counts, ties.method = "first")
)
Fdata$prop <- Fdata$count / ave(as.numeric(Fdata$count), Fdata$year, FUN = sum)
 
# How many of the all-time leading names to follow through the years.
top <- 7

# Keep every yearly row whose name is among the first `top` entries of the
# all-time ordering for that gender (column 1 of Morder/Forder).
Mrestricted <- Mdata[is.element(Mdata$name, Morder[1:top, 1]), ]
Frestricted <- Fdata[is.element(Fdata$name, Forder[1:top, 1]), ]
 
# Trend plots for the selected male names, one line per name.

# Raw yearly counts.
ggplot(Mrestricted, aes(year, count, group = name, colour = name)) +
  geom_line(size = 1) +
  scale_x_discrete(breaks = seq(1880, 2010, 20))

# Share of each year's total.
ggplot(Mrestricted, aes(year, prop, group = name, colour = name)) +
  geom_line(size = 1) +
  scale_x_discrete(breaks = seq(1880, 2010, 20)) +
  labs(y = "Proportion of Total Names")

# Within-year position on a log10 axis; line size is also mapped to torder.
ggplot(
  Mrestricted,
  aes(year, torder, group = name, colour = name, size = torder)
) +
  geom_line() +
  scale_x_discrete(breaks = seq(1880, 2010, 20)) +
  labs(y = "Order of Total Names That Year (log10)") +
  scale_y_log10()
 
# Trend plots for the selected female names, one line per name.

# Raw yearly counts.
ggplot(Frestricted, aes(year, count, group = name, colour = name)) +
  geom_line(size = 1) +
  scale_x_discrete(breaks = seq(1880, 2010, 20))

# Share of each year's total.
ggplot(Frestricted, aes(year, prop, group = name, colour = name)) +
  geom_line(size = 1) +
  scale_x_discrete(breaks = seq(1880, 2010, 20)) +
  labs(y = "Proportion of Total Names")

# Within-year position on a log10 axis; line size is also mapped to torder.
ggplot(
  Frestricted,
  aes(year, torder, group = name, colour = name, size = torder)
) +
  geom_line() +
  scale_x_discrete(breaks = seq(1880, 2010, 20)) +
  labs(y = "Order of Total Names That Year (log10)") +
  scale_y_log10()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.