Created
April 12, 2020 16:36
-
-
Save TristanLefebure/7bec3cad6cd588cb755a735e7f372c28 to your computer and use it in GitHub Desktop.
artificial correlation by regressing Pi_s and Pi_N / Pi_S?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#R script | |
#April 12 2020, TLefebure | |
#Do we generate an artificial correlation by regressing Pi_s and Pi_N / Pi_S estimated on the same dataset? | |
#A function to sample polymorphic site on a Ps gradient | |
#The Pi_N / Pi_S is stable, but Pi_s varies | |
#Very too naive, also these are not true Pi estimates (nucleotide diversity), | |
#more like theta_watterson (number of segregatting sites) | |
sample.Ps.Pn <- function(PnOverPs=0.1) { | |
#Pick some random true Ps from 0.001 to 0.1 | |
#Ps <- runif(100, min=0, max=0.1) | |
Ps <- seq(0.001,0.1,0.001) | |
nsim <- length(Ps) | |
#Let's fix the Pn/Ps at 10% (default) | |
Pn <- PnOverPs * Ps | |
#Let's say we have a 2E7 coding genome size (20k genes x 1kb) | |
#a third are synonymous sites, 2/3 non-syn sites | |
GS = 2E7 | |
synSites <- round(1/3 * GS) | |
nonsynSites <- GS - synSites | |
#Let's say that we are going to sample about 2.5% of it (sequencing about 500 genes: 1kb*500genes sites) | |
sampleSize = 1000*500 | |
#again 1/3 and 2/3 syn and non-syn sites, respectively | |
sampleSizeSyn <- round(sampleSize * 1/3) | |
sampleSizeNonSyn <- sampleSize - sampleSizeSyn | |
#How many polymorphism are we going to actually see ? | |
#Using a Poisson process with lambda = sampleSize * P | |
lambdasSyn <- sampleSizeSyn * Ps | |
lambdasNonSyn <- sampleSizeNonSyn * Pn | |
#sample number of syn polym (S) and non-syn polym (N) | |
#(in fact these are the number of segregating sites, not pi) | |
S <- integer(length=nsim) | |
N <- integer(length=nsim) | |
for(i in 1:nsim) { | |
S[i] <- rpois(1, lambdasSyn[i]) | |
N[i] <- rpois(1, lambdasNonSyn[i]) | |
} | |
#Calculate observed Ps and Pn/Ps | |
Ps.o <- S/sampleSizeSyn | |
Ps.n <- N/sampleSizeNonSyn | |
PnOverPs.o <- Ps.n/Ps.o | |
return(list(Ps.o=Ps.o, Ps.o=Ps.n, PnOverPs.o=PnOverPs.o)) | |
} | |
#One example with PnOverPs=0.1 | |
s1 <- sample.Ps.Pn() | |
plot(PnOverPs.o ~ Ps.o, | |
data=s1, xlab="Estimated Ps", | |
ylab="Estimated Pn/Ps", | |
main="Surdispersion of Pn/Ps for small Ps") | |
abline(h=0.1, lty=2) | |
#redo this 1000 times | |
sX <- lapply(1:1000, function(x) sample.Ps.Pn(PnOverPs=0.1)) | |
sX.lm <- lapply(sX, function(x) lm(x$PnOverPs.o ~ x$Ps.o)) | |
sX.r2 <- sapply(sX.lm, function(x) summary(x)$adj.r.squared) | |
sX.pv <- sapply(sX.lm, function(x) summary(x)$coefficients[2,4]) | |
hist(sX.r2, n=50, xlab="R^2", main="1000 random samples") | |
abline(v=0, lty=2) | |
hist(sX.pv, breaks=1/0.05, xlab="P-value", main="1000 random samples") | |
sum(sX.pv < 0.05) | |
## -> an excess of significant tests (~3X) with mild r2 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment