sashagusev/clark_simulation_and_fit.R

## clark_simulation_and_fit.R

get_correlated_trait = function(input,r) {
  r * (input) + sqrt(1-r^2) * rnorm(length(input),0,1)
}
get_noisy_trait = function(input,noise) {
  sqrt(1-noise) * (input) + sqrt(noise) * rnorm(length(input),0,1)
}

simulate_non_genetic = function(N,AM,ENV,err) {
  set.seed(42)

  kins = c(1,2,3,5,7,9)
  corrs = rep(NA,length(kins))

  # simulate individuals with some amount of wealth
  parent_A = rnorm(N,0,1)
  # simulate their partner with correlated wealth
  parent_B = get_correlated_trait( parent_A , AM )
  # measurement error
  m_parent_A = get_noisy_trait(parent_A,err)
  m_parent_B = get_noisy_trait(parent_A,err)

  # simulate siblings
  # generate two offspring who get mean wealth of the parents + noise
  child_A = get_noisy_trait( ((parent_A+parent_B)/2) , ENV )
  child_B = get_noisy_trait( ((parent_A+parent_B)/2) , ENV )
  # measurement error
  m_child_A = get_noisy_trait(child_A,err)
  m_child_B = get_noisy_trait(child_B,err)
  corrs[1] = cor(m_child_A,m_child_B)

  # simulate their partner with correlated wealth
  child_A_partner = get_correlated_trait(child_A,AM)
  child_B_partner = get_correlated_trait(child_B,AM)
  # generate two offspring who get mean wealth of the parents + noise
  child_A_child = get_noisy_trait( ((child_A+child_A_partner)/2) , ENV )
  child_B_child = get_noisy_trait( ((child_B+child_B_partner)/2) , ENV )
  # add measurement error
  m_child_A_child = get_noisy_trait(child_A_child,err)
  m_child_B_child = get_noisy_trait(child_B_child,err)

  # --- sibling-r
  corrs[2] = cor(m_child_A_child,m_child_B)
  # -- cousins
  corrs[3] = cor(m_child_A_child,m_child_B_child)

  # continue for three more generations
  for ( ctr in 4:6 ) {
    child_A = child_A_child
    child_B = child_B_child
    child_A_partner = get_correlated_trait(child_A,AM)
    child_B_partner = get_correlated_trait(child_B,AM)
    # generate two offspring who get mean wealth of the parents + noise
    child_A_child = get_noisy_trait( ((child_A+child_A_partner)/2) , ENV )
    child_B_child = get_noisy_trait( ((child_B+child_B_partner)/2) , ENV )
    # -- cousins-2
    # add measurement error
    m_child_A_child = get_noisy_trait(child_A_child,err)
    m_child_B_child = get_noisy_trait(child_B_child,err)
    corrs[ctr] = cor(m_child_A_child,m_child_B_child)
  }

  corrs
}


labs = c("Sibling","Sibling-r","Cousin","Cousin2","Cousin3","Cousin4")
kins = c(1,2,3,5,7,9)
# Clark Occ-Stat correlations:
clark_obs = c(0.578,0.502,0.431,0.266,0.139,0.071)

# Best fitting parameters
N = 50000
AM = 0.55
ENV = 0
err = 0.35

corrs = simulate_non_genetic(N,AM,ENV,err)

plot_clark = function( plot_corrs , plot_obs , plot_labs , title ) {
  plot(plot_corrs,plot_obs,las=1,cex=1.5,col="yellow",pch=19,xlim=c(0,0.8),ylim=c(0,0.8),xlab="Predicted Correlation",ylab="Observed Correlation",main=title )
  points(plot_corrs,plot_obs,cex=1.5,lwd=2)
  text(plot_corrs,plot_obs,plot_labs,pos=4)

  reg = lm(plot_obs~plot_corrs)
  abline(reg,lty=2,lwd=2)

  rsq = cor(plot_obs,plot_corrs)^2
  model_rmse = sqrt(mean((c(0,plot_corrs) - c(0,plot_obs))^2))
  legend("bottomright",legend=c(paste("R2 = ",round(rsq,2),sep=''),paste("RMSE = ",round(model_rmse,2))),bty="n")
}

par(mfrow=c(1,2))

clark_pred = 0.722*( (1+0.624)/2 )^kins
plot_clark(  plot_corrs = c(0,clark_pred) ,
             plot_obs = c(0,clark_obs) ,
             plot_labs = c("Unrelated",labs) , title="Clark Estimate\n(assortment = 0.624, hsq = 0.722)" )

plot_clark(  plot_corrs = c(0,corrs) ,
             plot_obs = c(0,clark_obs) ,
             plot_labs = c("Unrelated",labs) , title=paste("Fit from non-genetic model\n(assortment=",AM,", measurement error=",err,")",sep='') )


### Find best parameters for all Clark results

labs = c("Sibling","Sibling-r","Cousin","Cousin2","Cousin3","Cousin4")
kins = c(1,2,3,5,7,9)

clark_obs = matrix(nrow=9,ncol=length(kins))
clark_obs[,1] = c(0.375,0.336,0.269,0.168,0.578,0.522,0.479,0.326,0.431)
clark_obs[,2] = c(0.256,	0.246,	0.172,	0.057,	0.502,	0.380,	0.382,	0.228,	0.273)
clark_obs[,3] = c(0.208,	0.211,	0.144,	0.061,	0.431,	0.299,	0.291,	0.173,	0.234)
clark_obs[,4] = c(0.133,	0.141,	0.073,	0.069,	0.266,	0.234,	0.176,	0.186,	0.200)
clark_obs[,5] = c(0.097,	0.100,	0.052,	0.053,	0.139,	0.174,	0.071,	0.085,	0.150)
clark_obs[,6] = c(0.074,	0.078,	0.028,	0.021,	0.071,	0.066,	0.080,	0.014,	0.107)

clark_measures = c("ModStat 1910–1997",	"House value 1910–1997",	"IMD 1910–1997",	"CoDir 1910–1997",	"OccStat 1780–1859",	"OccStat 1860–1919",	"HighEd 1780–1859",	"HighEd 1860–1919",	"Literacy 1725–1869" )

for ( i in 1:nrow(clark_obs) ) {
  cat("\nOptimizing",clark_measures[i],'\n')

  current_obs = clark_obs[i,]

  # fit the Clark model
  y = log(current_obs)
  x = (kins)
  reg = lm(y~x)
  # compute inferred "assortative mating" and "heritability"
  b = exp(reg$coef[2])
  m = 2*b-1
  hsq = exp(reg$coef[1])

  clark_pred = hsq*( (1+m)/2 )^kins
  rsq = cor( c(0,clark_pred) , c(0,current_obs) )^2
  rmse = sqrt(mean((c(0,clark_pred) - c(0,current_obs))^2))
  cat("Clark model:",rsq,rmse,hsq,m,'\n')

  all.AM = seq(0.2,1,by=0.05)
  all.ENV = seq(0,1,by=0.05)
  all.ENV = 0
  all.err = seq(0,1,by=0.05)

  max_rsq = 0
  min_rmse = 100
  best_param = c(NA,NA,NA,NA)

  for ( AM in all.AM ) {
    for ( ENV in all.ENV ) {
      for ( err in all.err ) {
        corrs = simulate_non_genetic(N,AM,ENV,err)
        rsq = cor( c(0,corrs) , c(0,current_obs) )^2
        rmse = sqrt(mean((c(0,corrs) - c(0,current_obs))^2))
        if ( rmse < min_rmse ) {
          min_rmse = rmse
          best_param = c(rsq,rmse,AM,err)
        }
      }
    }
  }
  cat( "Non-genetic model:", best_param , '\n' )
}

	get_correlated_trait = function(input,r) {
	r * (input) + sqrt(1-r^2) * rnorm(length(input),0,1)
	}
	get_noisy_trait = function(input,noise) {
	sqrt(1-noise) * (input) + sqrt(noise) * rnorm(length(input),0,1)
	}

	simulate_non_genetic = function(N,AM,ENV,err) {
	set.seed(42)

	kins = c(1,2,3,5,7,9)
	corrs = rep(NA,length(kins))

	# simulate individuals with some amount of wealth
	parent_A = rnorm(N,0,1)
	# simulate their partner with correlated wealth
	parent_B = get_correlated_trait( parent_A , AM )
	# measurement error
	m_parent_A = get_noisy_trait(parent_A,err)
	m_parent_B = get_noisy_trait(parent_A,err)

	# simulate siblings
	# generate two offspring who get mean wealth of the parents + noise
	child_A = get_noisy_trait( ((parent_A+parent_B)/2) , ENV )
	child_B = get_noisy_trait( ((parent_A+parent_B)/2) , ENV )
	# measurement error
	m_child_A = get_noisy_trait(child_A,err)
	m_child_B = get_noisy_trait(child_B,err)
	corrs[1] = cor(m_child_A,m_child_B)

	# simulate their partner with correlated wealth
	child_A_partner = get_correlated_trait(child_A,AM)
	child_B_partner = get_correlated_trait(child_B,AM)
	# generate two offspring who get mean wealth of the parents + noise
	child_A_child = get_noisy_trait( ((child_A+child_A_partner)/2) , ENV )
	child_B_child = get_noisy_trait( ((child_B+child_B_partner)/2) , ENV )
	# add measurement error
	m_child_A_child = get_noisy_trait(child_A_child,err)
	m_child_B_child = get_noisy_trait(child_B_child,err)

	# --- sibling-r
	corrs[2] = cor(m_child_A_child,m_child_B)
	# -- cousins
	corrs[3] = cor(m_child_A_child,m_child_B_child)

	# continue for three more generations
	for ( ctr in 4:6 ) {
	child_A = child_A_child
	child_B = child_B_child
	child_A_partner = get_correlated_trait(child_A,AM)
	child_B_partner = get_correlated_trait(child_B,AM)
	# generate two offspring who get mean wealth of the parents + noise
	child_A_child = get_noisy_trait( ((child_A+child_A_partner)/2) , ENV )
	child_B_child = get_noisy_trait( ((child_B+child_B_partner)/2) , ENV )
	# -- cousins-2
	# add measurement error
	m_child_A_child = get_noisy_trait(child_A_child,err)
	m_child_B_child = get_noisy_trait(child_B_child,err)
	corrs[ctr] = cor(m_child_A_child,m_child_B_child)
	}

	corrs
	}


	labs = c("Sibling","Sibling-r","Cousin","Cousin2","Cousin3","Cousin4")
	kins = c(1,2,3,5,7,9)
	# Clark Occ-Stat correlations:
	clark_obs = c(0.578,0.502,0.431,0.266,0.139,0.071)

	# Best fitting parameters
	N = 50000
	AM = 0.55
	ENV = 0
	err = 0.35

	corrs = simulate_non_genetic(N,AM,ENV,err)

	plot_clark = function( plot_corrs , plot_obs , plot_labs , title ) {
	plot(plot_corrs,plot_obs,las=1,cex=1.5,col="yellow",pch=19,xlim=c(0,0.8),ylim=c(0,0.8),xlab="Predicted Correlation",ylab="Observed Correlation",main=title )
	points(plot_corrs,plot_obs,cex=1.5,lwd=2)
	text(plot_corrs,plot_obs,plot_labs,pos=4)

	reg = lm(plot_obs~plot_corrs)
	abline(reg,lty=2,lwd=2)

	rsq = cor(plot_obs,plot_corrs)^2
	model_rmse = sqrt(mean((c(0,plot_corrs) - c(0,plot_obs))^2))
	legend("bottomright",legend=c(paste("R2 = ",round(rsq,2),sep=''),paste("RMSE = ",round(model_rmse,2))),bty="n")
	}

	par(mfrow=c(1,2))

	clark_pred = 0.722*( (1+0.624)/2 )^kins
	plot_clark( plot_corrs = c(0,clark_pred) ,
	plot_obs = c(0,clark_obs) ,
	plot_labs = c("Unrelated",labs) , title="Clark Estimate\n(assortment = 0.624, hsq = 0.722)" )

	plot_clark( plot_corrs = c(0,corrs) ,
	plot_obs = c(0,clark_obs) ,
	plot_labs = c("Unrelated",labs) , title=paste("Fit from non-genetic model\n(assortment=",AM,", measurement error=",err,")",sep='') )


	### Find best parameters for all Clark results

	labs = c("Sibling","Sibling-r","Cousin","Cousin2","Cousin3","Cousin4")
	kins = c(1,2,3,5,7,9)

	clark_obs = matrix(nrow=9,ncol=length(kins))
	clark_obs[,1] = c(0.375,0.336,0.269,0.168,0.578,0.522,0.479,0.326,0.431)
	clark_obs[,2] = c(0.256, 0.246, 0.172, 0.057, 0.502, 0.380, 0.382, 0.228, 0.273)
	clark_obs[,3] = c(0.208, 0.211, 0.144, 0.061, 0.431, 0.299, 0.291, 0.173, 0.234)
	clark_obs[,4] = c(0.133, 0.141, 0.073, 0.069, 0.266, 0.234, 0.176, 0.186, 0.200)
	clark_obs[,5] = c(0.097, 0.100, 0.052, 0.053, 0.139, 0.174, 0.071, 0.085, 0.150)
	clark_obs[,6] = c(0.074, 0.078, 0.028, 0.021, 0.071, 0.066, 0.080, 0.014, 0.107)

	clark_measures = c("ModStat 1910–1997", "House value 1910–1997", "IMD 1910–1997", "CoDir 1910–1997", "OccStat 1780–1859", "OccStat 1860–1919", "HighEd 1780–1859", "HighEd 1860–1919", "Literacy 1725–1869" )

	for ( i in 1:nrow(clark_obs) ) {
	cat("\nOptimizing",clark_measures[i],'\n')

	current_obs = clark_obs[i,]

	# fit the Clark model
	y = log(current_obs)
	x = (kins)
	reg = lm(y~x)
	# compute inferred "assortative mating" and "heritability"
	b = exp(reg$coef[2])
	m = 2*b-1
	hsq = exp(reg$coef[1])

	clark_pred = hsq*( (1+m)/2 )^kins
	rsq = cor( c(0,clark_pred) , c(0,current_obs) )^2
	rmse = sqrt(mean((c(0,clark_pred) - c(0,current_obs))^2))
	cat("Clark model:",rsq,rmse,hsq,m,'\n')

	all.AM = seq(0.2,1,by=0.05)
	all.ENV = seq(0,1,by=0.05)
	all.ENV = 0
	all.err = seq(0,1,by=0.05)

	max_rsq = 0
	min_rmse = 100
	best_param = c(NA,NA,NA,NA)

	for ( AM in all.AM ) {
	for ( ENV in all.ENV ) {
	for ( err in all.err ) {
	corrs = simulate_non_genetic(N,AM,ENV,err)
	rsq = cor( c(0,corrs) , c(0,current_obs) )^2
	rmse = sqrt(mean((c(0,corrs) - c(0,current_obs))^2))
	if ( rmse < min_rmse ) {
	min_rmse = rmse
	best_param = c(rsq,rmse,AM,err)
	}
	}
	}
	}
	cat( "Non-genetic model:", best_param , '\n' )
	}