TristanLefebure/sampling_pnps.R

## sampling_pnps.R
#R script
#April 12 2020, TLefebure

#Do we generate an artificial correlation by regressing Pi_s and Pi_N / Pi_S estimated on the same dataset?

#A function to sample polymorphic site on a Ps gradient
#The Pi_N / Pi_S is stable, but Pi_s varies
#Very too naive, also these are not true Pi estimates (nucleotide diversity),
#more like theta_watterson (number of segregatting sites)

sample.Ps.Pn <- function(PnOverPs=0.1) {

	#Pick some random true Ps from 0.001 to 0.1
	#Ps <- runif(100, min=0, max=0.1)
	Ps <- seq(0.001,0.1,0.001)
	nsim <- length(Ps)

	#Let's fix the Pn/Ps at 10% (default)
	Pn <- PnOverPs * Ps

	#Let's say we have a 2E7 coding genome size (20k genes x 1kb)
	#a third are synonymous sites, 2/3 non-syn sites
	GS = 2E7
	synSites <- round(1/3 * GS)
	nonsynSites <- GS - synSites

	#Let's say that we are going to sample about 2.5% of it (sequencing about 500 genes: 1kb*500genes sites)
	sampleSize = 1000*500
	#again 1/3 and 2/3 syn and non-syn sites, respectively
	sampleSizeSyn <- round(sampleSize * 1/3)
	sampleSizeNonSyn <- sampleSize - sampleSizeSyn


	#How many polymorphism are we going to actually see ?
	#Using a Poisson process with lambda = sampleSize * P
	lambdasSyn <- sampleSizeSyn * Ps
	lambdasNonSyn <- sampleSizeNonSyn * Pn

	#sample number of syn polym (S) and non-syn polym (N)
	#(in fact these are the number of segregating sites, not pi)
	S <- integer(length=nsim)
	N <- integer(length=nsim)
	for(i in 1:nsim) {
		S[i] <- rpois(1, lambdasSyn[i])
		N[i] <- rpois(1, lambdasNonSyn[i])
	}

	#Calculate observed Ps and Pn/Ps
	Ps.o <- S/sampleSizeSyn
	Ps.n <- N/sampleSizeNonSyn
	PnOverPs.o <- Ps.n/Ps.o
	return(list(Ps.o=Ps.o, Ps.o=Ps.n, PnOverPs.o=PnOverPs.o))
}


#One example with PnOverPs=0.1
s1 <- sample.Ps.Pn()
plot(PnOverPs.o ~ Ps.o,
	data=s1, xlab="Estimated Ps",
	ylab="Estimated Pn/Ps",
	main="Surdispersion of Pn/Ps for small Ps")
abline(h=0.1, lty=2)


#redo this 1000 times
sX <- lapply(1:1000, function(x) sample.Ps.Pn(PnOverPs=0.1))

sX.lm <- lapply(sX, function(x) lm(x$PnOverPs.o ~ x$Ps.o))
sX.r2 <- sapply(sX.lm, function(x) summary(x)$adj.r.squared)
sX.pv <- sapply(sX.lm, function(x) summary(x)$coefficients[2,4])

hist(sX.r2, n=50, xlab="R^2", main="1000 random samples")
abline(v=0, lty=2)

hist(sX.pv, breaks=1/0.05, xlab="P-value", main="1000 random samples")
sum(sX.pv < 0.05)
## -> an excess of significant tests (~3X) with mild r2
	#R script
	#April 12 2020, TLefebure

	#Do we generate an artificial correlation by regressing Pi_s and Pi_N / Pi_S estimated on the same dataset?

	#A function to sample polymorphic site on a Ps gradient
	#The Pi_N / Pi_S is stable, but Pi_s varies
	#Very too naive, also these are not true Pi estimates (nucleotide diversity),
	#more like theta_watterson (number of segregatting sites)

	sample.Ps.Pn <- function(PnOverPs=0.1) {

	#Pick some random true Ps from 0.001 to 0.1
	#Ps <- runif(100, min=0, max=0.1)
	Ps <- seq(0.001,0.1,0.001)
	nsim <- length(Ps)

	#Let's fix the Pn/Ps at 10% (default)
	Pn <- PnOverPs * Ps

	#Let's say we have a 2E7 coding genome size (20k genes x 1kb)
	#a third are synonymous sites, 2/3 non-syn sites
	GS = 2E7
	synSites <- round(1/3 * GS)
	nonsynSites <- GS - synSites

	#Let's say that we are going to sample about 2.5% of it (sequencing about 500 genes: 1kb*500genes sites)
	sampleSize = 1000*500
	#again 1/3 and 2/3 syn and non-syn sites, respectively
	sampleSizeSyn <- round(sampleSize * 1/3)
	sampleSizeNonSyn <- sampleSize - sampleSizeSyn


	#How many polymorphism are we going to actually see ?
	#Using a Poisson process with lambda = sampleSize * P
	lambdasSyn <- sampleSizeSyn * Ps
	lambdasNonSyn <- sampleSizeNonSyn * Pn

	#sample number of syn polym (S) and non-syn polym (N)
	#(in fact these are the number of segregating sites, not pi)
	S <- integer(length=nsim)
	N <- integer(length=nsim)
	for(i in 1:nsim) {
	S[i] <- rpois(1, lambdasSyn[i])
	N[i] <- rpois(1, lambdasNonSyn[i])
	}

	#Calculate observed Ps and Pn/Ps
	Ps.o <- S/sampleSizeSyn
	Ps.n <- N/sampleSizeNonSyn
	PnOverPs.o <- Ps.n/Ps.o
	return(list(Ps.o=Ps.o, Ps.o=Ps.n, PnOverPs.o=PnOverPs.o))
	}


	#One example with PnOverPs=0.1
	s1 <- sample.Ps.Pn()
	plot(PnOverPs.o ~ Ps.o,
	data=s1, xlab="Estimated Ps",
	ylab="Estimated Pn/Ps",
	main="Surdispersion of Pn/Ps for small Ps")
	abline(h=0.1, lty=2)



	#redo this 1000 times
	sX <- lapply(1:1000, function(x) sample.Ps.Pn(PnOverPs=0.1))

	sX.lm <- lapply(sX, function(x) lm(x$PnOverPs.o ~ x$Ps.o))
	sX.r2 <- sapply(sX.lm, function(x) summary(x)$adj.r.squared)
	sX.pv <- sapply(sX.lm, function(x) summary(x)$coefficients[2,4])

	hist(sX.r2, n=50, xlab="R^2", main="1000 random samples")
	abline(v=0, lty=2)

	hist(sX.pv, breaks=1/0.05, xlab="P-value", main="1000 random samples")
	sum(sX.pv < 0.05)
	## -> an excess of significant tests (~3X) with mild r2