Skip to content

Instantly share code, notes, and snippets.

@TransGirlCodes
Forked from coleoguy/dstat.R
Last active August 29, 2015 14:13
Show Gist options
  • Save TransGirlCodes/6ee4f52a83a2ff22a73e to your computer and use it in GitHub Desktop.
Save TransGirlCodes/6ee4f52a83a2ff22a73e to your computer and use it in GitHub Desktop.
## ABBA BABA tests
## Heath Blackmon
## coleoguy@gmail.com
## 28 July 2013
## This file has algorithms found in:
## Durand, Eric Y., et al. "Testing for ancient admixture between
## closely related populations." Molecular biology and evolution
## 28.8 (2011): 2239-2252.
## and
## Eaton, D. A. R., and R. H. Ree. 2013. Inferring phylogeny and
## introgression using RADseq data: An example from flowering
## plants (Pedicularis: Orobanchaceae). Syst. Biol. 62:689–706.
## I will use some function from seqinr
library(seqinr)
## First we have eqn. 1 from page 2240 of Durand
## input is a full alignment of four
## sequences the function finds the biallelic SNPs
## that are useful and
## then does the calculation based on eqn.1
CalcD <- function(alignment = "dalignment.fasta", boot=F, replicate=1000){
## First we have eqn. 1 from page 2240 of Durand 2012
## input is an alignment of four sequences it finds
## the biallelic SNPs that are useful and
## then does the calculation
alignment <- read.alignment(alignment, format = "fasta") # read in the alignment
alignment.matrix <- matrix(, length(alignment$nam), nchar(alignment$seq[[1]])) # make a matrix for the alignment
for(i in 1:length(alignment$nam)){
alignment.matrix[i, ] <- unlist(strsplit(alignment$seq[[i]], "")) # fill in the matrix
}
abba <- 0 # set up my variables
baba <- 0 # set up my variables
for(i in 1:ncol(alignment.matrix)){ # run through all sites
if(length(unique(alignment.matrix[, i])) == 2){ # unique(c(p1,p2,p3,o))==2 aka biallelic
if(alignment.matrix[1, i] != alignment.matrix[2, i]){ # p1 != p2 aka different resolutions in p1 and p2
if(alignment.matrix[4, i] != alignment.matrix[3, i]){ # o != p3 durand says "less likely pattern due to seq. errors
if(alignment.matrix[3, i] == alignment.matrix[1, i]) {baba <- baba + 1} # add to the count of baba sites
if(alignment.matrix[2, i] == alignment.matrix[3, i]) {abba <- abba + 1} # add to the count of abba sites
}
}
}
}
d <- (abba - baba) / (abba + baba) #what its all about
## THIS SECTION WILL CALCULATE THE P-VAL BASED ON BOOTSTRAPPING
## SITES ARE SAMPLED WITH REPLACEMENT TO MAKE A NEW DATASET OF
## OF EQUAL SIZE TO THE ORIGINAL DATASET THIS ALLOWS US TO CALCULATE
## THE STANDARD DEVIATION AND THUS A Z SCORE.
if(boot==T){
sim.d<-vector()
foo <- ncol(alignment.matrix)
sim.matrix<-matrix(,4,foo)
for(k in 1:replicate){
for(j in 1:4){
sim.matrix[j,1:foo] <-sample(alignment.matrix[j,1:foo],replace=T)
}
t.abba <- t.baba <- 0 # set up my variables
for(i in 1:ncol(sim.matrix)){ # run through all sites
if(length(unique(sim.matrix[, i])) == 2){ # unique(c(p1,p2,p3,o))==2 aka biallelic
if(sim.matrix[1, i] != sim.matrix[2, i]){ # p1 != p2 aka different resolutions in p1 and p2
if(sim.matrix[4, i] != sim.matrix[3, i]){ # o != p3 durand says "less likely pattern due to seq. errors
if(sim.matrix[3, i] == sim.matrix[1, i]) {t.baba <- t.baba + 1} # add to the count of baba sites
if(sim.matrix[2, i] == sim.matrix[3, i]) {t.abba <- t.abba + 1} # add to the count of abba sites
}
}
}
}
sim.d[k] <- (t.abba - t.baba) / (t.abba + t.baba) #what its all about
}
}
sd.sim.d <- round(sqrt(var(sim.d)),5)
mn.sim.d <- round(mean(sim.d),5)
new.pval <- 2*(pnorm(-abs(d/sd.sim.d)))
## NOW WE MAKE THE OUTPUTS
cat("\nSites in alignment =", ncol(alignment.matrix))
cat("\nNumber of sites with ABBA pattern =", abba)
cat("\nNumber of sites with BABA pattern =", baba)
cat("\n\nD raw statistic / Z-score = ", d, " / ", d/sd.sim.d)
cat("\n\nResults from ", replicate, "bootstraps")
cat("\nSD D statistic =", sd.sim.d)
cat("\nP-value (that D=0) = ",new.pval) #after Eaton and Ree 2013
}
CalcPopD <- function(alignment = "alignment.fasta"){
## Now we have eqn. 2 from page 2240
## input is an alignment the can take multiple sequences from each
## population of interest. IMPORTANT MAKE SURE SEQUENCES ARE IN ORDER
## P1, P2, P3, OUTGROUP! Again we find the biallelic sites but now
## those biallelic sites need not be fixed and we will calculate frequencies
## of SNP for each population. The way the function is set up we do need to
## feed in an alignment where each sequence from a population has the same name:
## pop1
## AACCACAAGCCAGCTCAGCTACAG
## pop1
## TACAACAAGCGAGCTCAGCTACAG
## pop1
## GGCCACAAGCCAGCTCAGCTACAG
## pop2
## GGCCACAAGCCAGCTCAGCTACAG
## pop2
## GGCCACAAGCCAGCTCAGCTACAG
## pop3
## TACCACAAGCCAGCTCAGCTACAG
## OUTGROUP
## TACCAGGAGCCAGCTCTTCTACCC
Mode <- function(x) { # i need a little mode function which R is lacking ugh
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
alignment<-read.alignment(alignment, format="fasta") # read in the alignment
alignment.matrix<-matrix(,length(alignment$nam),nchar(alignment$seq[[1]])+1) # make a matrix for the alignment
for(i in 1:length(alignment$nam)){
alignment.matrix[i,2:ncol(alignment.matrix)]<-unlist(strsplit(alignment$seq[[i]],"")) # fill in the matrix
}
alignment.matrix[,1]<-alignment$nam # get those names into our matrix row names dont work :(
groups<-unique(alignment$nam)
p1 <- p2 <- p3 <- p4 <- 0 # lets just set up the variable names from the durand paper
numerator <- denominator <- 0
useful<-0 # plus some of my own
segregating<-0 # plus some of my own
seg.pos<-F # plus some of my own
for(i in 2:ncol(alignment.matrix)){ # run through all sites
seg.pos<-F # reset this switch
if(length(unique(alignment.matrix[,i]))==2){ # unique(c(p1,p2,p3,o))==2 aka biallelic
A <- Mode(alignment.matrix[alignment.matrix[, 1] == groups[4], i]) # lets treat the more common variant in the outgroup as "A"
B <- unique(alignment.matrix[,i])[unique(alignment.matrix[, i]) != A] # not purposely obfuscating... the other variant in variable "B"
if(B %in% unique(alignment.matrix[alignment.matrix[, 1] == groups[3], i])){ # makes sure that we have at least some indication of an ABBA/BABA pattern
if(length(unique(alignment.matrix[alignment.matrix[, 1] %in% groups[1:2], i])) == 2){ # makes sure that we've got some different resolutions in the ingroups
useful <- useful + 1 # lets just keep track of how many sites are even useful
if(length(unique(alignment.matrix[alignment.matrix[, 1] == groups[1], i])) == 2) {seg.pos<-T}# next 5 lines are a lame way of counting sites that are segregating
if(length(unique(alignment.matrix[alignment.matrix[, 1] == groups[2], i])) == 2) {seg.pos<-T}# vs those that are fixed another words is population sampling
if(length(unique(alignment.matrix[alignment.matrix[, 1] == groups[3], i])) == 2) {seg.pos<-T}# really of any value within the data set that we are examining
if(length(unique(alignment.matrix[alignment.matrix[, 1] == groups[4], i])) == 2) {seg.pos<-T}
if(seg.pos == T){segregating <- segregating + 1}
#print(segregating)
p1 <- (sum(alignment.matrix[alignment.matrix[, 1] == groups[1], i] == A))/length(alignment.matrix[alignment.matrix[, 1] == groups[1], i]) # freq of A snp in first population
p2 <- (sum(alignment.matrix[alignment.matrix[, 1] == groups[2], i] == A))/length(alignment.matrix[alignment.matrix[, 1] == groups[2], i]) # freq of A snp in second population
p3 <- (sum(alignment.matrix[alignment.matrix[, 1] == groups[3], i] == A))/length(alignment.matrix[alignment.matrix[, 1] == groups[3], i]) # freq of A snp in third population
p4 <- (sum(alignment.matrix[alignment.matrix[, 1] == groups[4], i] == A))/length(alignment.matrix[alignment.matrix[, 1] == groups[4], i]) # freq of A snp in outgroup population
# Durands explanation of eqn 2 is lacking... as least to my feable mind!
# it appears to me that as written p hat is actually the frequency of SNP "B" so....
# snap... vindicated my interpretation matches that found in the supplemental material of the
# heliconius genome paper supplement... too cool
p1 <- 1-p1 #convert these over from proportion A to proportion B
p2 <- 1-p2 #convert these over from proportion A to proportion B
p3 <- 1-p3 #convert these over from proportion A to proportion B
p4 <- 1-p4 #convert these over from proportion A to proportion B
numerator <- ((1 - p1) * p2 * p3 * (1 - p4)) - (p1 * (1 - p2) * p3 * (1 - p4)) + numerator # build up our numerator sum
denominator <- ((1 - p1) * p2 * p3 * (1 - p4)) + (p1 * (1 - p2) * p3 * (1 - p4)) + denominator # build up our denominator sum
}
}
}
}
d <- numerator / denominator #what its all about
user.result <- list()
user.result$d.stat <- d
user.result$pval <- "HELP"
user.result$align.length <- ncol(alignment.matrix) - 1
user.result$useful.sites <- useful
user.result$seg.sites <- segregating
print(paste("Sites in alignment =", ncol(alignment.matrix) - 1))
print(paste("Number of sites with ABBA or BABA patterns =", useful))
print(paste("Number of ABBA or BABA sites that are still segregating in at least one population =", segregating))
print(paste("D statistic =", d))
}
## This next function is from:
## Eaton, D. A. R., and R. H. Ree. 2013. Inferring phylogeny and introgression using RADseq data:
## An example from flowering plants (Pedicularis: Orobanchaceae). Syst. Biol. 62:689–706.
## input is a full alignment of five OTUs
## the function finds the biallelic SNPs that are useful and
## then does the calculations
CalcPartD <- function(alignment = "alignment.fasta", boot=F, replicate = 1000, alpha =.05){
alignment <- read.alignment(alignment, format = "fasta") # read in the alignment
alignment.matrix <- matrix(, length(alignment$nam), nchar(alignment$seq[[1]])) # make a matrix for the alignment
for(i in 1:length(alignment$nam)){
alignment.matrix[i, ] <- unlist(strsplit(alignment$seq[[i]], "")) # fill in the matrix
}
abbaa <- babaa <- 0 ## d1 # set up my variables
ababa <- baaba <- 0 ## d2 # set up my variables
abbba <- babba <- 0 ## d12
for(i in 1:ncol(alignment.matrix)){ # run through all sites
if(length(unique(alignment.matrix[, i])) == 2){ # unique(c(p1,p2,p3.1,p3.2,O))==2 aka biallelic
if(alignment.matrix[1, i] != alignment.matrix[2, i]){ # p1 != p2 aka different resolutions in p1 and p2
if(alignment.matrix[5, i] != alignment.matrix[3, i] | alignment.matrix[5, i] != alignment.matrix[4, i] ){# o != p3.1 or o is !=p3.2
## D1
if(alignment.matrix[4, i] == alignment.matrix[5, i]){
if(alignment.matrix[1, i] == alignment.matrix[5, i]){abbaa <- abbaa+1}
if(alignment.matrix[2, i] == alignment.matrix[5, i]){babaa <- babaa+1}
}
## D2
if(alignment.matrix[3, i] == alignment.matrix[5, i]){
if(alignment.matrix[1, i] == alignment.matrix[5, i]){ababa <- ababa+1}
if(alignment.matrix[2, i] == alignment.matrix[5, i]){baaba <- baaba+1}
}
##D12
if(alignment.matrix[3, i] == alignment.matrix[4, i]){
if(alignment.matrix[1, i] == alignment.matrix[5, i]){abbba <- abbba+1}
if(alignment.matrix[2, i] == alignment.matrix[5, i]){babba <- babba+1}
}
}
}
}
}
d1 <- (abbaa - babaa) / (abbaa + babaa)
d2 <- (ababa - baaba) / (ababa + baaba)
d12 <- (abbba - babba) / (abbba + babba)
## THIS SECTION WILL CALCULATE THE P-VAL BASED ON BOOTSTRAPPING
## SITES ARE SAMPLED WITH REPLACEMENT TO MAKE A NEW DATASET OF
## OF EQUAL SIZE TO THE ORIGINAL DATASET THIS ALLOWS US TO CALCULATE
## THE STANDARD DEVIATION AND THUS A Z SCORE.
if(boot==T){
sim.d1<-vector()
sim.d2<-vector()
sim.d12<-vector()
foo <- ncol(alignment.matrix)
sim.matrix<-matrix(,5,foo)
for(k in 1:replicate){
for(j in 1:5){sim.matrix[j,1:foo] <-sample(alignment.matrix[j,1:foo],replace=T)}
##NOW JUST RERUN OUR WHOLE ALGORITHM
t.abbaa <- t.babaa <- 0 ## d1
t.ababa <- t.baaba <- 0 ## d2 # set up my variables
t.abbba <- t.babba <- 0 ## d12
for(i in 1:ncol(sim.matrix)){ # run through all sites
if(length(unique(sim.matrix[, i])) == 2){ # unique(c(p1,p2,p3.1,p3.2,O))==2 aka biallelic
if(sim.matrix[1, i] != sim.matrix[2, i]){ # p1 != p2 aka different resolutions in p1 and p2
if(sim.matrix[5, i] != sim.matrix[3, i] | sim.matrix[5, i] != sim.matrix[4, i] ){ # o != p3.1 or o is !=p3.2
## D1
if(sim.matrix[4, i] == sim.matrix[5, i]){
if(sim.matrix[1, i] == sim.matrix[5, i]){t.abbaa <- t.abbaa+1}
if(sim.matrix[2, i] == sim.matrix[5, i]){t.babaa <- t.babaa+1}
}
## D2
if(sim.matrix[3, i] == sim.matrix[5, i]){
if(sim.matrix[1, i] == sim.matrix[5, i]){t.ababa <- t.ababa+1}
if(sim.matrix[2, i] == sim.matrix[5, i]){t.baaba <- t.baaba+1}
}
##D12
if(sim.matrix[3, i] == sim.matrix[4, i]){
if(sim.matrix[1, i] == sim.matrix[5, i]){t.abbba <- t.abbba+1}
if(sim.matrix[2, i] == sim.matrix[5, i]){t.babba <- t.babba+1}
}
}
}
}
}
sim.d1[k] <- (t.abbaa - t.babaa) / (t.abbaa + t.babaa)
sim.d2[k] <- (t.ababa - t.baaba) / (t.ababa + t.baaba)
sim.d12[k] <- (t.abbba - t.babba) / (t.abbba + t.babba)
}
sd.sim.d1 <- round(sqrt(var(sim.d1)),5)
mn.sim.d1 <- round(mean(sim.d1),5)
new.pval.d1 <- 2*(pnorm(-abs(d1/sd.sim.d1)))
sd.sim.d2 <- round(sqrt(var(sim.d2)),5)
mn.sim.d2 <- round(mean(sim.d2),5)
new.pval.d2 <- 2*(pnorm(-abs(d2/sd.sim.d2)))
sd.sim.d12 <- round(sqrt(var(sim.d12)),5)
mn.sim.d12 <- round(mean(sim.d12),5)
new.pval.d12 <- 2*(pnorm(-abs(d12/sd.sim.d12)))
}
if(is.nan(d1)) d1<- "Error Missing Data"
if(is.nan(d2)) d2<- "Error Missing Data"
if(is.nan(d12)) d12<- "Error Missing Data"
## NOW WE MAKE THE OUTPUTS
cat("Sites in alignment =", ncol(alignment.matrix))
cat("\nD1 sites with ABBAA/BABAA pattern =", abbaa,"/",babaa)
cat("\nD2 sites with ABABA/BAABA pattern =", ababa,"/",baaba)
cat("\nD12 sites with ABABA/BAABA pattern =", abbba,"/",babba)
cat("\n\nD1 raw statistic / Z-score =", d1,"/",d1/sd.sim.d1)
cat("\nD2 raw statistic / Z-score =", d2,"/",d2/sd.sim.d2)
cat("\nD12 raw statistic / Z-score =", d12,"/",d12/sd.sim.d12)
if(boot==T){
if(!(d1=="Error Missing Data")){
cat("\n\nD1 Bootstrap Statistics: ")
cat("SD = ", sd.sim.d1)
cat(" P-val = ", new.pval.d1)
}
if(!(d2=="Error Missing Data")){
cat("\nD2 Bootstrap Statistics: ")
cat("SD = ", sd.sim.d2)
cat(" P-val = ", new.pval.d2)
}
if(!(d12=="Error Missing Data")){
cat("\nD12 Bootstrap Statistics: ")
cat("SD = ", sd.sim.d12)
cat(" P-val = ", new.pval.d12)
}
cat("\n\nBonferroni adjustment: alpha selected:",alpha," number of tests:",3,"\nSo P-value of less than ",round(alpha/3,4)," should be considered significant", sep="")
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment