This file contains functions implementing ABBA BABA type tests for gene flow between populations. The three functions included are CalcD, CalcPartD, and CalcPopD.
CalcD: This is the original version of the ABBA BABA test and calculates the D-statistic as found in Durand et al. 2011, page 2240.
CalcPopD: This version of the test is able to use many sequences from each population and calculates the D-statistic from allele frequencies (eqn. 2 of Durand et al. 2011).
CalcPartD: This implements the partitioned D-statistics (D1, D2, D12) of Eaton and Ree 2013 for a five-taxon alignment.
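A minimal usage sketch, assuming the file has been sourced into an R session where the seqinr package is installed (the file and FASTA names here are hypothetical placeholders):
source("abba_baba_tests.R")                                      # hypothetical name for this file
CalcD(alignment = "p1_p2_p3_outgroup.fasta", boot = TRUE)        # four sequences, in order: P1, P2, P3, outgroup
CalcPopD(alignment = "populations_plus_outgroup.fasta")          # multiple sequences per population, ordered P1, P2, P3, OUTGROUP
CalcPartD(alignment = "five_taxa.fasta", boot = TRUE)            # five sequences, in order: P1, P2, P3_1, P3_2, outgroup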
## ABBA BABA tests
## Heath Blackmon
## coleoguy@gmail.com
## 28 July 2013
## This file has algorithms found in:
## Durand, Eric Y., et al. "Testing for ancient admixture between
## closely related populations." Molecular biology and evolution
## 28.8 (2011): 2239-2252.
## and
## Eaton, D. A. R., and R. H. Ree. 2013. Inferring phylogeny and
## introgression using RADseq data: An example from flowering
## plants (Pedicularis: Orobanchaceae). Syst. Biol. 62:689–706.
## I will use some functions from seqinr
library(seqinr)
## First we have eqn. 1 from page 2240 of Durand et al. 2011.
## Input is a full alignment of four sequences; the function finds
## the biallelic SNPs that are useful and then does the calculation
## based on eqn. 1.
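## For reference, with the taxa ordered P1, P2, P3, outgroup this is the
## standard ABBA BABA statistic (a summary of the counting done below, not
## additional functionality):
##   ABBA site: P2 and P3 share the allele not found in the outgroup
##   BABA site: P1 and P3 share the allele not found in the outgroup
##   D = (nABBA - nBABA) / (nABBA + nBABA)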
CalcD <- function(alignment = "dalignment.fasta", boot = F, replicate = 1000){
  ## First we have eqn. 1 from page 2240 of Durand et al. 2011.
  ## Input is an alignment of four sequences; it finds the biallelic
  ## SNPs that are useful and then does the calculation.
  alignment <- read.alignment(alignment, format = "fasta")                        # read in the alignment
  alignment.matrix <- matrix(, length(alignment$nam), nchar(alignment$seq[[1]]))  # make a matrix for the alignment
  for(i in 1:length(alignment$nam)){
    alignment.matrix[i, ] <- unlist(strsplit(alignment$seq[[i]], ""))             # fill in the matrix
  }
  abba <- 0                                                   # set up my variables
  baba <- 0                                                   # set up my variables
  for(i in 1:ncol(alignment.matrix)){                         # run through all sites
    if(length(unique(alignment.matrix[, i])) == 2){           # unique(c(p1,p2,p3,o))==2 aka biallelic
      if(alignment.matrix[1, i] != alignment.matrix[2, i]){   # p1 != p2 aka different resolutions in p1 and p2
        if(alignment.matrix[4, i] != alignment.matrix[3, i]){ # o != p3 (Durand: "less likely pattern due to seq. errors")
          if(alignment.matrix[3, i] == alignment.matrix[1, i]) {baba <- baba + 1} # add to the count of baba sites
          if(alignment.matrix[2, i] == alignment.matrix[3, i]) {abba <- abba + 1} # add to the count of abba sites
        }
      }
    }
  }
  d <- (abba - baba) / (abba + baba)                          # what it's all about
  ## THIS SECTION WILL CALCULATE THE P-VAL BASED ON BOOTSTRAPPING.
  ## SITES ARE SAMPLED WITH REPLACEMENT TO MAKE A NEW DATASET OF
  ## EQUAL SIZE TO THE ORIGINAL DATASET. THIS ALLOWS US TO CALCULATE
  ## THE STANDARD DEVIATION AND THUS A Z-SCORE.
  if(boot == T){
    sim.d <- vector()
    foo <- ncol(alignment.matrix)
    sim.matrix <- matrix(, 4, foo)
    for(k in 1:replicate){
      for(j in 1:4){
        sim.matrix[j, 1:foo] <- sample(alignment.matrix[j, 1:foo], replace = T)
      }
      t.abba <- t.baba <- 0                               # set up my variables
      for(i in 1:ncol(sim.matrix)){                       # run through all sites
        if(length(unique(sim.matrix[, i])) == 2){         # unique(c(p1,p2,p3,o))==2 aka biallelic
          if(sim.matrix[1, i] != sim.matrix[2, i]){       # p1 != p2 aka different resolutions in p1 and p2
            if(sim.matrix[4, i] != sim.matrix[3, i]){     # o != p3 (Durand: "less likely pattern due to seq. errors")
              if(sim.matrix[3, i] == sim.matrix[1, i]) {t.baba <- t.baba + 1} # add to the count of baba sites
              if(sim.matrix[2, i] == sim.matrix[3, i]) {t.abba <- t.abba + 1} # add to the count of abba sites
            }
          }
        }
      }
      sim.d[k] <- (t.abba - t.baba) / (t.abba + t.baba)   # what it's all about
    }
    sd.sim.d <- round(sqrt(var(sim.d)), 5)
    mn.sim.d <- round(mean(sim.d), 5)
    new.pval <- 2 * (pnorm(-abs(d / sd.sim.d)))
  }
  ## NOW WE MAKE THE OUTPUTS
  cat("\nSites in alignment =", ncol(alignment.matrix))
  cat("\nNumber of sites with ABBA pattern =", abba)
  cat("\nNumber of sites with BABA pattern =", baba)
  cat("\n\nD raw statistic =", d)
  if(boot == T){                                          # bootstrap results are only available when boot = T
    cat("\nZ-score =", d / sd.sim.d)
    cat("\n\nResults from", replicate, "bootstraps")
    cat("\nSD D statistic =", sd.sim.d)
    cat("\nP-value (that D=0) =", new.pval)               # after Eaton and Ree 2013
  }
}
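## A hedged usage sketch (the FASTA file name is a hypothetical placeholder);
## the alignment must contain exactly four sequences in the order P1, P2, P3, outgroup:
## CalcD(alignment = "p1_p2_p3_outgroup.fasta", boot = TRUE, replicate = 1000)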
CalcPopD <- function(alignment = "alignment.fasta"){
  ## Now we have eqn. 2 from page 2240.
  ## Input is an alignment that can take multiple sequences from each
  ## population of interest. IMPORTANT: MAKE SURE SEQUENCES ARE IN ORDER
  ## P1, P2, P3, OUTGROUP! Again we find the biallelic sites, but now
  ## those biallelic sites need not be fixed and we will calculate the
  ## frequency of the SNP for each population. The way the function is set up
  ## we do need to feed in an alignment where each sequence from a population
  ## has the same name:
  ## pop1
  ## AACCACAAGCCAGCTCAGCTACAG
  ## pop1
  ## TACAACAAGCGAGCTCAGCTACAG
  ## pop1
  ## GGCCACAAGCCAGCTCAGCTACAG
  ## pop2
  ## GGCCACAAGCCAGCTCAGCTACAG
  ## pop2
  ## GGCCACAAGCCAGCTCAGCTACAG
  ## pop3
  ## TACCACAAGCCAGCTCAGCTACAG
  ## OUTGROUP
  ## TACCAGGAGCCAGCTCTTCTACCC
  Mode <- function(x) {                                     # a little mode function that base R is lacking
    ux <- unique(x)
    ux[which.max(tabulate(match(x, ux)))]
  }
  alignment <- read.alignment(alignment, format = "fasta")                           # read in the alignment
  alignment.matrix <- matrix(, length(alignment$nam), nchar(alignment$seq[[1]]) + 1) # make a matrix for the alignment
  for(i in 1:length(alignment$nam)){
    alignment.matrix[i, 2:ncol(alignment.matrix)] <- unlist(strsplit(alignment$seq[[i]], "")) # fill in the matrix
  }
  alignment.matrix[, 1] <- alignment$nam                    # put the names into the first column (row names don't work here)
  groups <- unique(alignment$nam)
  p1 <- p2 <- p3 <- p4 <- 0                                 # set up the variable names from the Durand paper
  numerator <- denominator <- 0
  useful <- 0                                               # plus some of my own
  segregating <- 0                                          # plus some of my own
  seg.pos <- F                                              # plus some of my own
  for(i in 2:ncol(alignment.matrix)){                       # run through all sites
    seg.pos <- F                                            # reset this switch
    if(length(unique(alignment.matrix[, i])) == 2){         # unique(c(p1,p2,p3,o))==2 aka biallelic
      A <- Mode(alignment.matrix[alignment.matrix[, 1] == groups[4], i])          # treat the more common variant in the outgroup as "A"
      B <- unique(alignment.matrix[, i])[unique(alignment.matrix[, i]) != A]      # not purposely obfuscating... the other variant goes in variable "B"
      if(B %in% unique(alignment.matrix[alignment.matrix[, 1] == groups[3], i])){ # make sure that we have at least some indication of an ABBA/BABA pattern
        if(length(unique(alignment.matrix[alignment.matrix[, 1] %in% groups[1:2], i])) == 2){ # make sure that we've got different resolutions in the ingroups
          useful <- useful + 1                              # keep track of how many sites are even useful
          if(length(unique(alignment.matrix[alignment.matrix[, 1] == groups[1], i])) == 2) {seg.pos <- T} # the next 5 lines are a lame way of counting sites that are segregating
          if(length(unique(alignment.matrix[alignment.matrix[, 1] == groups[2], i])) == 2) {seg.pos <- T} # vs. those that are fixed; in other words, is population sampling
          if(length(unique(alignment.matrix[alignment.matrix[, 1] == groups[3], i])) == 2) {seg.pos <- T} # really of any value within the data set that we are examining
          if(length(unique(alignment.matrix[alignment.matrix[, 1] == groups[4], i])) == 2) {seg.pos <- T}
          if(seg.pos == T){segregating <- segregating + 1}
          #print(segregating)
          p1 <- (sum(alignment.matrix[alignment.matrix[, 1] == groups[1], i] == A)) / length(alignment.matrix[alignment.matrix[, 1] == groups[1], i]) # freq of A snp in first population
          p2 <- (sum(alignment.matrix[alignment.matrix[, 1] == groups[2], i] == A)) / length(alignment.matrix[alignment.matrix[, 1] == groups[2], i]) # freq of A snp in second population
          p3 <- (sum(alignment.matrix[alignment.matrix[, 1] == groups[3], i] == A)) / length(alignment.matrix[alignment.matrix[, 1] == groups[3], i]) # freq of A snp in third population
          p4 <- (sum(alignment.matrix[alignment.matrix[, 1] == groups[4], i] == A)) / length(alignment.matrix[alignment.matrix[, 1] == groups[4], i]) # freq of A snp in outgroup population
          # Durand's explanation of eqn. 2 is lacking... at least to my feeble mind!
          # It appears to me that as written p hat is actually the frequency of SNP "B", so....
          # snap... vindicated: my interpretation matches that found in the supplemental material of the
          # Heliconius genome paper... too cool
          p1 <- 1 - p1                                      # convert these over from proportion A to proportion B
          p2 <- 1 - p2                                      # convert these over from proportion A to proportion B
          p3 <- 1 - p3                                      # convert these over from proportion A to proportion B
          p4 <- 1 - p4                                      # convert these over from proportion A to proportion B
          numerator <- ((1 - p1) * p2 * p3 * (1 - p4)) - (p1 * (1 - p2) * p3 * (1 - p4)) + numerator     # build up our numerator sum
          denominator <- ((1 - p1) * p2 * p3 * (1 - p4)) + (p1 * (1 - p2) * p3 * (1 - p4)) + denominator # build up our denominator sum
        }
      }
    }
  }
  d <- numerator / denominator                              # what it's all about
  user.result <- list()
  user.result$d.stat <- d
  user.result$pval <- NA                                    # no p-value is calculated in this population version
  user.result$align.length <- ncol(alignment.matrix) - 1
  user.result$useful.sites <- useful
  user.result$seg.sites <- segregating
  print(paste("Sites in alignment =", ncol(alignment.matrix) - 1))
  print(paste("Number of sites with ABBA or BABA patterns =", useful))
  print(paste("Number of ABBA or BABA sites that are still segregating in at least one population =", segregating))
  print(paste("D statistic =", d))
  invisible(user.result)                                    # also return the results as a list
}
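## A hedged usage sketch (the FASTA file name is a hypothetical placeholder);
## sequences from the same population must share a name and the populations
## must appear in the order P1, P2, P3, OUTGROUP:
## res <- CalcPopD(alignment = "populations_plus_outgroup.fasta")
## res$d.stat   # the results are also returned invisibly as a list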
## This next function is from:
## Eaton, D. A. R., and R. H. Ree. 2013. Inferring phylogeny and introgression using RADseq data:
## An example from flowering plants (Pedicularis: Orobanchaceae). Syst. Biol. 62:689–706.
## Input is a full alignment of five OTUs;
## the function finds the biallelic SNPs that are useful and
## then does the calculations.
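## For reference, with the taxa ordered P1, P2, P3_1, P3_2, outgroup the three
## partitioned statistics computed below are (a summary, not added functionality):
##   D1  = (nABBAA - nBABAA) / (nABBAA + nBABAA)
##   D2  = (nABABA - nBAABA) / (nABABA + nBAABA)
##   D12 = (nABBBA - nBABBA) / (nABBBA + nBABBA)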
CalcPartD <- function(alignment = "alignment.fasta", boot = F, replicate = 1000, alpha = .05){
  alignment <- read.alignment(alignment, format = "fasta")                        # read in the alignment
  alignment.matrix <- matrix(, length(alignment$nam), nchar(alignment$seq[[1]]))  # make a matrix for the alignment
  for(i in 1:length(alignment$nam)){
    alignment.matrix[i, ] <- unlist(strsplit(alignment$seq[[i]], ""))             # fill in the matrix
  }
  abbaa <- babaa <- 0                                         ## d1  # set up my variables
  ababa <- baaba <- 0                                         ## d2  # set up my variables
  abbba <- babba <- 0                                         ## d12
  for(i in 1:ncol(alignment.matrix)){                         # run through all sites
    if(length(unique(alignment.matrix[, i])) == 2){           # unique(c(p1,p2,p3.1,p3.2,O))==2 aka biallelic
      if(alignment.matrix[1, i] != alignment.matrix[2, i]){   # p1 != p2 aka different resolutions in p1 and p2
        if(alignment.matrix[5, i] != alignment.matrix[3, i] | alignment.matrix[5, i] != alignment.matrix[4, i]){ # o != p3.1 or o != p3.2
          ## D1
          if(alignment.matrix[4, i] == alignment.matrix[5, i]){
            if(alignment.matrix[1, i] == alignment.matrix[5, i]){abbaa <- abbaa + 1}
            if(alignment.matrix[2, i] == alignment.matrix[5, i]){babaa <- babaa + 1}
          }
          ## D2
          if(alignment.matrix[3, i] == alignment.matrix[5, i]){
            if(alignment.matrix[1, i] == alignment.matrix[5, i]){ababa <- ababa + 1}
            if(alignment.matrix[2, i] == alignment.matrix[5, i]){baaba <- baaba + 1}
          }
          ## D12
          if(alignment.matrix[3, i] == alignment.matrix[4, i]){
            if(alignment.matrix[1, i] == alignment.matrix[5, i]){abbba <- abbba + 1}
            if(alignment.matrix[2, i] == alignment.matrix[5, i]){babba <- babba + 1}
          }
        }
      }
    }
  }
  d1 <- (abbaa - babaa) / (abbaa + babaa)
  d2 <- (ababa - baaba) / (ababa + baaba)
  d12 <- (abbba - babba) / (abbba + babba)
  ## THIS SECTION WILL CALCULATE THE P-VAL BASED ON BOOTSTRAPPING.
  ## SITES ARE SAMPLED WITH REPLACEMENT TO MAKE A NEW DATASET OF
  ## EQUAL SIZE TO THE ORIGINAL DATASET. THIS ALLOWS US TO CALCULATE
  ## THE STANDARD DEVIATION AND THUS A Z-SCORE.
  if(boot == T){
    sim.d1 <- vector()
    sim.d2 <- vector()
    sim.d12 <- vector()
    foo <- ncol(alignment.matrix)
    sim.matrix <- matrix(, 5, foo)
    for(k in 1:replicate){
      for(j in 1:5){sim.matrix[j, 1:foo] <- sample(alignment.matrix[j, 1:foo], replace = T)}
      ## NOW JUST RERUN OUR WHOLE ALGORITHM
      t.abbaa <- t.babaa <- 0                             ## d1
      t.ababa <- t.baaba <- 0                             ## d2  # set up my variables
      t.abbba <- t.babba <- 0                             ## d12
      for(i in 1:ncol(sim.matrix)){                       # run through all sites
        if(length(unique(sim.matrix[, i])) == 2){         # unique(c(p1,p2,p3.1,p3.2,O))==2 aka biallelic
          if(sim.matrix[1, i] != sim.matrix[2, i]){       # p1 != p2 aka different resolutions in p1 and p2
            if(sim.matrix[5, i] != sim.matrix[3, i] | sim.matrix[5, i] != sim.matrix[4, i]){ # o != p3.1 or o != p3.2
              ## D1
              if(sim.matrix[4, i] == sim.matrix[5, i]){
                if(sim.matrix[1, i] == sim.matrix[5, i]){t.abbaa <- t.abbaa + 1}
                if(sim.matrix[2, i] == sim.matrix[5, i]){t.babaa <- t.babaa + 1}
              }
              ## D2
              if(sim.matrix[3, i] == sim.matrix[5, i]){
                if(sim.matrix[1, i] == sim.matrix[5, i]){t.ababa <- t.ababa + 1}
                if(sim.matrix[2, i] == sim.matrix[5, i]){t.baaba <- t.baaba + 1}
              }
              ## D12
              if(sim.matrix[3, i] == sim.matrix[4, i]){
                if(sim.matrix[1, i] == sim.matrix[5, i]){t.abbba <- t.abbba + 1}
                if(sim.matrix[2, i] == sim.matrix[5, i]){t.babba <- t.babba + 1}
              }
            }
          }
        }
      }
      sim.d1[k] <- (t.abbaa - t.babaa) / (t.abbaa + t.babaa)
      sim.d2[k] <- (t.ababa - t.baaba) / (t.ababa + t.baaba)
      sim.d12[k] <- (t.abbba - t.babba) / (t.abbba + t.babba)
    }
    sd.sim.d1 <- round(sqrt(var(sim.d1)), 5)
    mn.sim.d1 <- round(mean(sim.d1), 5)
    new.pval.d1 <- 2 * (pnorm(-abs(d1 / sd.sim.d1)))
    sd.sim.d2 <- round(sqrt(var(sim.d2)), 5)
    mn.sim.d2 <- round(mean(sim.d2), 5)
    new.pval.d2 <- 2 * (pnorm(-abs(d2 / sd.sim.d2)))
    sd.sim.d12 <- round(sqrt(var(sim.d12)), 5)
    mn.sim.d12 <- round(mean(sim.d12), 5)
    new.pval.d12 <- 2 * (pnorm(-abs(d12 / sd.sim.d12)))
  }
  if(is.nan(d1)) d1 <- "Error Missing Data"
  if(is.nan(d2)) d2 <- "Error Missing Data"
  if(is.nan(d12)) d12 <- "Error Missing Data"
  ## NOW WE MAKE THE OUTPUTS
  cat("Sites in alignment =", ncol(alignment.matrix))
  cat("\nD1 sites with ABBAA/BABAA pattern =", abbaa, "/", babaa)
  cat("\nD2 sites with ABABA/BAABA pattern =", ababa, "/", baaba)
  cat("\nD12 sites with ABBBA/BABBA pattern =", abbba, "/", babba)
  cat("\n\nD1 raw statistic =", d1)
  cat("\nD2 raw statistic =", d2)
  cat("\nD12 raw statistic =", d12)
  if(boot == T){                                            # bootstrap results are only available when boot = T
    if(!(d1 == "Error Missing Data")){
      cat("\n\nD1 Bootstrap Statistics: ")
      cat("Z-score = ", d1 / sd.sim.d1)
      cat(" SD = ", sd.sim.d1)
      cat(" P-val = ", new.pval.d1)
    }
    if(!(d2 == "Error Missing Data")){
      cat("\nD2 Bootstrap Statistics: ")
      cat("Z-score = ", d2 / sd.sim.d2)
      cat(" SD = ", sd.sim.d2)
      cat(" P-val = ", new.pval.d2)
    }
    if(!(d12 == "Error Missing Data")){
      cat("\nD12 Bootstrap Statistics: ")
      cat("Z-score = ", d12 / sd.sim.d12)
      cat(" SD = ", sd.sim.d12)
      cat(" P-val = ", new.pval.d12)
    }
    cat("\n\nBonferroni adjustment: alpha selected:", alpha, " number of tests:", 3, "\nSo a P-value of less than ", round(alpha / 3, 4), " should be considered significant", sep = "")
  }
}
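## A hedged usage sketch (the FASTA file name is a hypothetical placeholder);
## the alignment must contain exactly five sequences in the order
## P1, P2, P3_1, P3_2, outgroup; with boot = TRUE the three tests are
## compared against a Bonferroni-adjusted threshold of alpha/3:
## CalcPartD(alignment = "five_taxa.fasta", boot = TRUE, replicate = 1000, alpha = 0.05)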