Skip to content

Instantly share code, notes, and snippets.

@svendvn
Last active February 16, 2018 00:16
Show Gist options
  • Save svendvn/736c5cf5729a5ae5ac719ced2ed07238 to your computer and use it in GitHub Desktop.
Save svendvn/736c5cf5729a5ae5ac719ced2ed07238 to your computer and use it in GitHub Desktop.
# Load the raw table; the file has no header row. `FALSE` is spelled out
# because `F` is an ordinary variable that can be reassigned.
a <- read.table('usa_big.txt', header = FALSE)
# Drop the final character of each string in `s` (whatever it is) and parse
# what remains as a number.
#
# s: character vector, e.g. "5.5%".
# Returns a numeric vector of the same length; unparseable remainders give NA.
remove_and_make_numeric <- function(s) {
  without_last_char <- substring(s, 1, nchar(s) - 1)
  as.numeric(without_last_char)
}
# Sum of squared proportions for a vector of percentages.
#
# x: numeric vector of percentages; each value is divided by 100 before being
#    squared.
# Returns a single number, sum((x / 100)^2). Empty input gives 0; NA values
# in `x` propagate to the result.
#
# Note: an earlier, commented-out correction term (within^2 / (500 - 20),
# with `within` = sum of the proportions) was never applied and is not
# applied here either.
summarize <- function(x) {
  proportions <- x / 100
  # Vectorised form of the original index loop. The loop used
  # `1:length(y)`, which visits indices 1 and 0 on empty input and ended up
  # returning numeric(0) instead of 0.
  sum(proportions^2)
}
# Combine two name/percentage lists into the summed product of the
# percentages of the names they share.
#
# xnames, ynames: name vectors. Entries equal to '.' are treated as
#                 placeholders and dropped together with their percentages.
# xprobs, yprobs: numeric percentages aligned with xnames / ynames.
#
# Side effect: when at least one name occurs in both lists, prints the shared
# name with the largest contribution as "<name> ; <contribution>".
#
# Returns the sum, over names present in both lists, of
# yprob * xprob / 10000 (the division turns two percentages into a product
# of proportions). Returns 0 when no name is shared, including when either
# list is empty after dropping placeholders.
calc_same_name_prob <- function(xnames, xprobs, ynames, yprobs) {
  keep_x <- which(xnames != '.')
  keep_y <- which(ynames != '.')
  xnames <- xnames[keep_x]
  xprobs <- xprobs[keep_x]
  ynames <- ynames[keep_y]
  yprobs <- yprobs[keep_y]

  # For every y name, the position of its first occurrence in xnames, or NA
  # when absent. This replaces the per-name list lookup inside a
  # `1:length(ynames)` loop, which failed with "argument is of length zero"
  # when no y name survived the placeholder filter.
  pos_in_x <- match(ynames, xnames)
  shared <- which(!is.na(pos_in_x))

  names_c <- ynames[shared]
  contribs <- yprobs[shared] * xprobs[pos_in_x[shared]] / 10000

  if (length(names_c) > 0) {
    index <- which.max(contribs)
    print(paste(names_c[index], ';', contribs[index]))
  }
  sum(contribs)
}
# Label the raw columns, turn the percentage strings into numbers and tag
# every row with its year.
colnames(a) <- c('rank', 'boy', 'boyperc', 'girl', 'girlperc')
# remove_and_make_numeric() already works on whole vectors, so it is called
# once per column instead of once per element through sapply(). This also
# avoids sapply() attaching the raw strings as element names.
a$boyperc <- remove_and_make_numeric(as.character(a$boyperc))
a$girlperc <- remove_and_make_numeric(as.character(a$girlperc))
# Rows are assumed to come in blocks of 1000 per year, starting with 2016
# and running down to 1880.
a$year <- rep(2016:1880, each = 1000)
# For each block of 1000 consecutive rows (one block per year), compute
# calc_same_name_prob() between that block's boy and girl name lists.
# `%/%` keeps only whole blocks, and seq_len() makes the loop a no-op when
# there are none (`1:0` would have run the body for i = 1 and i = 0).
n_blocks <- nrow(a) %/% 1000
# Convert the name columns once rather than on every iteration.
boy_names <- as.character(a$boy)
girl_names <- as.character(a$girl)
same_name <- numeric(n_blocks)
for (i in seq_len(n_blocks)) {
  is <- ((i - 1) * 1000 + 1):(i * 1000)
  same_name[i] <- calc_same_name_prob(boy_names[is], a$boyperc[is],
                                      girl_names[is], a$girlperc[is])
}
# Go from one row per (year, rank) to one row per year, with a
# boyperc.<rank> / girlperc.<rank> column pair for every rank.
b <- reshape(a, direction = 'wide', idvar = c('year'), timevar = 'rank',
             v.names = c('boyperc', 'girlperc'), drop = c('girl', 'boy'))

# The positional indexing below relies on the wide columns following the
# first column in alternating boy/girl order: boys at the even positions
# 2..2000, girls at the odd positions 3..2001.
boy_cols <- seq(2, 2000, by = 2)
girl_cols <- boy_cols + 1

# Despite the "top10" in the column names, all 1000 ranks go into each sum.
b$top10_boys <- apply(b[, boy_cols], 1, summarize)
b$top10_girls <- apply(b[, girl_cols], 1, summarize)

# Quick base-graphics look: boys in blue, girls overlaid in red.
plot(b$year, b$top10_boys, type = 'l', col = 'blue')
lines(b$year, b$top10_girls, col = 'red')
# Stack the two per-year series into one long table for ggplot2. Column 1 of
# `b` is paired first with the second-to-last column, then with the last
# column of `b`.
n_col <- ncol(b)
essential_1 <- b[, c(1, n_col - 1)]
essential_2 <- b[, c(1, n_col)]
stacked <- rbind(as.matrix(essential_1), as.matrix(essential_2))
new_df <- as.data.frame(stacked)
# Esperanto labels: Jaro = year, Probableco = probability, Sekso = sex.
names(new_df) <- c('Jaro', 'Probableco')
# The first half of the rows came from essential_1, the second half from
# essential_2 (Knabo = boy, Knabino = girl).
new_df$Sekso <- rep(c('Knabo', 'Knabino'), each = nrow(new_df) / 2)

library(ggplot2)
# Axis ticks every 20 years from 1880 to 2000, plus the final year 2016.
xticks <- c(seq(1880, 2000, by = 20), 2016)
# NOTE(review): the colour values are unnamed, so they are presumably
# assigned in the sorted order of Sekso ('Knabino' red, 'Knabo' blue) -
# confirm against the rendered legend.
ggplot(data = new_df, aes(x = Jaro, y = Probableco, color = Sekso)) +
  geom_line() +
  ggtitle('Probableco, ke du beboj havas identajn personajn nomojn en Usono') +
  scale_color_manual(values = c('red', "blue")) +
  ylim(c(0, 0.025)) +
  scale_x_continuous(name = 'Naskigxjaro', breaks = xticks,
                     limits = c(1880, 2016)) +
  theme_bw()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment