Last active
May 17, 2022 20:12
-
-
Save cecileane/0894d70161a2b490dbb330b96a29550a to your computer and use it in GitHub Desktop.
Julia script to calculate the D statistics with locus-bootstrapping for significance. used in Blair & Ané (2020, https://doi.org/10.1093/sysbio/syz056)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#= | |
inspired from evobiR::CalcD (https://github.com/coleoguy/evobir/blob/master/R/CalcD.R) | |
used in [Blair & Ané 2020](https://doi.org/10.1093/sysbio/syz056) | |
=# | |
using BioSymbols | |
using BioSequences | |
using FASTX | |
using PhyloNetworks | |
using Random | |
using Statistics | |
using Distributions | |
using DataFrames | |
using CSV | |
""" | |
calcPatternCounts(filename, taxa, nsites_perlocus=nothing; ambig=:D) | |
Take an alignment in a file in fasta format, a vector `taxa` of 4 taxon names, | |
the number of sites at each locus (optionally), and calculate | |
4 vectors for the following number of sites: | |
total, ABBA sites, BABA sites, and BBAA sites. | |
These numbers of sites are calculated by locus, so are stored in a vector | |
for each type of site. If `nsites_perlocus` is not provided, it is assumed | |
that the alignment has a single locus and each vector has a single value. | |
Note that the BBAA sites are *not* used by the D statistic, but are used by | |
some other statistics. So why not get them here. | |
By default, sites with any ambiguity are dropped (`ambig=:D`) before calculating | |
these 3 numbers. Alternatively, they could be resolved (`ambig=:R`) randomly. | |
Otherwise, ambiguous sites are just left alone (e.g. ignored: `ambig=:I`). | |
In that case for instance, `N--N` would count as an ABBA pattern. | |
**Warning**: this is assuming that the fasta file is in *sequential* format, | |
*not interleaved*. More specifically, each taxon name should appear only once. | |
For this one time, the corresponding sequence could be broken across several lines though. | |
Use `reformat_fasta_linear` to reformat an interleaved fasta file. | |
# example | |
```julia | |
shell> cat examples/tinyexample.fasta | |
>P1 | |
ATTGGAA | |
>P5 | |
ATCGGAA | |
>P2 | |
AYCGGGG | |
>P3 | |
ATTGAGG | |
>P4 | |
ATTGA-G | |
julia> file = "examples/tinyexample.fasta"; | |
julia> calcPatternCounts(file, ["P2","P1","P3","typo"]) | |
(missing, missing, missing, missing) | |
julia> calcPatternCounts(file, ["P2","P1","P3","P5"]) | |
([6], [1], [2], [0]) | |
julia> calcPatternCounts(file, ["P2","P1","P3","P5"], [4,3]) # 2 loci: with 4 sites and 3 sites | |
([3, 3], [1, 0], [0, 2], [0, 0]) | |
julia> calcPatternCounts(file, ["P2","P1","P3","P5"], repeat([4],2)) # 2 loci each with 4 sites | |
ERROR: the sum of locus lengths is not the total alignment length | |
julia> calcPatternCounts(file, ["P1","P2","P3","P5"]) # switching taxa 1 & 2 changes abba into baba | |
([6], [2], [1], [0]) | |
julia> calcPatternCounts(file, ["P1","P2","P5","P3"]) # switching taxa 3 & 4 changes abba into baba | |
([6], [1], [2], [0]) | |
julia> calcPatternCounts(file, ["P5","P3","P1","P2"]) # switching 1,2 with 3,4 has no effect | |
([6], [1], [2], [0]) | |
julia> calcPatternCounts(file, ["P1","P2","P3","P4"]) # one more site excluded, gap for P4 | |
([5], [0], [0], [1]) | |
julia> calcPatternCounts(file, ["P1","P3","P2","P4"]) # switching 2 & 3 switches bbaa into baba | |
([5], [0], [1], [0]) | |
julia> calcPatternCounts(file, ["P1","P2","P3","P4"]; ambig=:R) # ambiguous Y resolved | |
([6], [0], [0], [1]) | |
``` | |
""" | |
function calcPatternCounts(file::String, taxa::AbstractVector, | |
nsites_perlocus = nothing; ambig=:D) | |
length(Set(taxa)) == 4 || error("we need 4 distinct taxa for ABBA and BABA patterns") | |
# read alignment and make it a matrix | |
species, alg = PhyloNetworks.readFastaToArray(file) | |
## bug: due to [FASTA.Reader](https://github.com/BioJulia/BioSequences.jl/issues/64) | |
## where ambiguous sites can cause a sequences to be interpreted as AA instead of DNA. | |
## fixed in PhyloNetworks commit 651131a5, and in v0.14.4 whenever that version is released. | |
for i in 1:length(alg) | |
if !(eltype(alg[i]) <: DNA) | |
alg[i] = LongDNA{4}(string(alg[i])) # 4 bits needed for ambiguous sites | |
end | |
end | |
# alg = PhyloNetworks.readfastatodna(file) with next registered version | |
# perhaps do better using BioSequences or BioAlignments directly: | |
# r = FASTA.Reader(open("trial0_concat.fasta")) | |
# check that the 4 taxa are present, and all have same alignment length | |
# sometimes, 1 or more taxa might be missing a gene of interest | |
ind = indexin(taxa, species) # species[ind] in correct order to look at ABBA and BABA | |
# if any of the 4 taxa is missing: return missing values | |
any(isnothing, ind) && return (missing,missing,missing,missing) | |
# alignment for the (sub)set of 4 taxa in the appropriate order | |
alg_4 = view(alg, ind) | |
# check that all 4 sequences have the same # of sites, otherwise return missing values | |
nsites = length(alg_4[1]) | |
if !isnothing(nsites_perlocus) | |
nsites == sum(nsites_perlocus) || | |
error("the sum of locus lengths is not the total alignment length") | |
nloci = length(nsites_perlocus) | |
else | |
nloci = 1 | |
nsites_perlocus = [nsites] | |
end | |
all(x -> length(x)==nsites, alg_4) || return (missing,missing,missing,missing) | |
# check that total length = sum of length of all loci, if provided | |
## deal with ambiguity in sequence data | |
# R: A or G | |
# Y: C or T | |
# S: G or C | |
# W: A or T | |
# K: G or T | |
# M: A or C | |
if ambig in [:D, :R] | |
# D: drop all ambiguous sites | |
# R: resolve, but first drop any site having a 3- or 4-ambiguity | |
target = (BioSymbols.ACGT..., DNA_R, DNA_Y, DNA_S, DNA_W, DNA_K, DNA_M) | |
j = nsites # start from the last site, to keep indices valid after deletion | |
# for j in nsites:-1:1 | |
for locus in nloci:-1:1 | |
for _ in nsites_perlocus[locus]:-1:1 | |
keepj = (ambig==:D ? | |
all(s -> BioSymbols.iscertain(s[j]), alg_4) : | |
all(s -> s[j] in target, alg_4) ) | |
if !keepj | |
for s in alg_4 # delete the site | |
deleteat!(s,j) | |
end | |
nsites_perlocus[locus] -= 1 # decrement # sites in this locus | |
end | |
j -= 1 | |
end | |
end | |
end | |
nsites = length(alg_4[1]) # possibly less than before | |
sum(nsites_perlocus) == nsites || | |
error("bug: after removing uncertain sites, the loci and total length don't match...") | |
## resolve remaining ambigous sites randomly | |
if ambig == :R | |
function resolver(x) | |
if iscertain(x) return x; end | |
if x==DNA_R return rand([DNA_A, DNA_G]); end | |
if x==DNA_Y return rand([DNA_C, DNA_T]); end | |
if x==DNA_S return rand([DNA_G, DNA_C]); end | |
if x==DNA_W return rand([DNA_A, DNA_T]); end | |
if x==DNA_K return rand([DNA_G, DNA_T]); end | |
if x==DNA_M return rand([DNA_A, DNA_C]); | |
else error("weird base: $x"); end | |
end | |
for a in alg_4 | |
for j in 1:nsites | |
isambiguous(a[j]) || continue | |
a[j] = resolver(a[j]) | |
end | |
end | |
end | |
# now calculate the number of sites with pattern ABBA or BABA | |
abba_all = Int[] # all loci | |
baba_all = Int[] | |
bbaa_all = Int[] | |
pattern = Vector{DNA}(undef, 4) # to avoid garbage collection | |
j = 0 # index in the concatenated alignment | |
for locus in 1:nloci | |
abba = 0 | |
baba = 0 | |
bbaa = 0 | |
for _ in 1:nsites_perlocus[locus] | |
j += 1 | |
for i in 1:4 pattern[i] = alg_4[i][j]; end | |
length(unique(pattern)) == 2 || continue # focus on biallelic sites | |
if pattern[1] == pattern[2] | |
if pattern[3] == pattern[4] # else bbba or bbab: do not count | |
bbaa += 1 | |
end | |
continue | |
end | |
# at this point: pattern[1] != pattern[2] so we focus on xy.. patterns | |
pattern[3] != pattern[4] || continue # focus on ..ba patterns | |
if pattern[3] == pattern[1] | |
baba += 1 | |
else # pattern[3] == pattern[2] | |
abba += 1 | |
end | |
end | |
push!(abba_all, abba) | |
push!(baba_all, baba) | |
push!(bbaa_all, bbaa) | |
end | |
return nsites_perlocus,abba_all,baba_all,bbaa_all | |
end | |
""" | |
calcD(x, y) | |
Take a vector `x` of ABBA counts and a vector `y` of BABA counts, assuming that | |
in each list: 1 count corresponds to 1 gene. Then calculate the D statistics for | |
the concatenated data, that is: [sum(ABBA) - sum(BABA)] / [sum(ABBA) + sum(BABA)] | |
# example | |
```julia | |
julia> calcD(20, 5) | |
0.6 | |
julia> calcD([5,4,6,5], [3,0,0,2]) | |
0.6 | |
``` | |
""" | |
function calcD(x, y) | |
sx = sum(x); sy = sum(y) | |
return (sx - sy)/(sx + sy) | |
end | |
""" | |
calcDztest_concatenated(x,y) | |
calcDsd_concatenated(x, y, nbootstrap=5000) | |
Take a number of ABBA sites `x` and number of BABA sites `y` (possibly | |
calculated by `calcPatternCounts` from a concatenated alignment) | |
and test whether the data are consistent with mean D=0. | |
Output: D statistic, its standard deviation, and test p-value (2-tailed). | |
Assumptions: | |
1. independent sites in the single (concatenated) alignment | |
from which `x,y` come from. | |
2. the number of "informative" sites for the ABBA-BABA test, that is, the | |
number of ABBA + BABA sites is considered fixed. In other words, the test | |
conditions on the number of informative sites. | |
Note that D = (x-y)/(x+y) becomes phat - (1-phat) = 2*(phat -0.5) | |
where phat = estimated proportion of ABBA sites among informative sites, | |
that is, among ABBA & BABA sites. | |
Testing "mean D=0" is equivalent to testing p=0.5, which amounts to doing | |
is simple binomial test, or chi-square goodness-of-fit test if all sites | |
are assumed independent. | |
The first version does a z test (equivalent to chi-square goodness-of-fit test). | |
The second version calculates the standard deviation of the D statistics | |
using bootstrap under the null hypothesis that p_abba = p_baba, re-sampling the | |
ABBA-BABA sites only (*not* resampling all sites: see assumption 2 above). | |
Then calculates z = D/sd and pvalue = two-tailed probability beyond z under N(0,1). | |
This second version should be modified to allow for linkage between sites | |
(in which case the z-test becomes invalid). | |
For now, this version is equivalent to the z-test but slower for large counts, | |
and more reliable for small count provided that `nbootstrap` is large. | |
# examples | |
```julia | |
julia> calcDztest_concatenated(20,5) # D=0.6, z=3, p-value=0.0027 | |
(0.6, 3.0000000000000004, 0.002699796063260186) | |
julia> calcDztest_concatenated(5,20) # D=-0.6, same p-value as above | |
(-0.6, -3.0, 0.002699796063260186) | |
julia> calcDsd_concatenated(20, 5, 10000) # same D, similar z & p. | |
(0.6, 3.0081772816666144, 0.0026281977308263323) | |
``` | |
""" | |
function calcDztest_concatenated(x,y) | |
n = x+y | |
D = (x-y)/n | |
phat = x/n | |
z = (phat-0.5)*sqrt(n)*2 # = (phat - p0) / sqrt(p0 * (1-p0) * 1/n) when p0=0.5 | |
pvalue = ccdf(Normal(), abs(z))*2 | |
return D, z, pvalue | |
end | |
function calcDsd_concatenated(x, y, nbootstrap=5000) | |
n = x+y | |
D = (x-y)/n | |
d = Distributions.Binomial(n, 0.5) # null hypothesis for D=0: x=y=n/2 | |
bx = rand(d, nbootstrap) | |
bD = (2*bx .- n) ./ n # y=n-x so x-y = x-(n-x) = 2x-n | |
z = D / Statistics.std(bD) | |
pvalue = ccdf(Normal(), abs(z))*2 | |
return D, z, pvalue | |
end | |
""" | |
calcDsd_byloci(x, y, nbootstrap=5000) | |
Take a list of ABBA counts and a list of BABA counts, like `calcD`. | |
Calculate the standard deviation of the D statistics using bootstrap, | |
re-sampling loci. The sites are *not* resampled: only the loci are. | |
This is assuming independent loci, but possibly linked sites within loci. | |
There is one case of concern: if all resampled loci have no informative | |
(ABBA or BABA) sites. In this case, the D statistic is undefined: 0-0/(0+0) | |
If that happens, the corresponding bootstrap replicate is dropped. | |
`nbootstrap` is the total # of valid bootstrap replicates (not counting | |
dropped reps). | |
Warning: the bootstrap procedure requires a large sample size to be valid. | |
Here, this means a large number of loci. In the example below, we only have | |
4 loci, so the results from this bootstrap cannot be trusted. But 4 loci | |
makes the example run fast. | |
# example | |
```julia | |
julia> abba_4loci = [5,4,6,5]; baba_4loci = [3,0,0,2]; | |
julia> D_value = calcD(abba_4loci, baba_4loci) | |
0.6 | |
julia> D_sd = calcDsd_byloci(abba_4loci, baba_4loci, 10_000) | |
0.17212358417715387 | |
julia> z = D_value/D_sd | |
3.485867453134517 | |
julia> pval = ccdf(Normal(), abs(z))*2 | |
0.0004905439979869396 | |
julia> abba_4loci = [11,0,0,9]; baba_4loci = [3,0,0,2]; | |
julia> D_value = calcD(abba_4loci, baba_4loci) | |
0.6 | |
julia> D_sd = calcDsd_byloci(abba_4loci, baba_4loci, 10_000) | |
0.02462521664116594 | |
julia> calcDsd_byloci([1,2], [1,2,3], 10) | |
ERROR: different loci for ABBA and BABA counts | |
julia> calcDsd_byloci([0,-1,0], [0,0,0], 10) | |
ERROR: some ABBA counts are negative | |
julia> calcDsd_byloci([0,0,0], [0,0,0], 10) | |
ERROR: all ABBA and BABA are 0 | |
``` | |
""" | |
function calcDsd_byloci(x, y, nbootstrap=5000) | |
nloci = length(x) | |
length(y) == nloci || error("different loci for ABBA and BABA counts") | |
bD = Vector{Float64}(undef, nbootstrap) | |
b = 1 | |
all(x .>= 0) || error("some ABBA counts are negative") | |
all(y .>= 0) || error("some BABA counts are negative") | |
any(x .> 0) || any(y .> 0) || error("all ABBA and BABA are 0") | |
while b <= nbootstrap | |
bsx=0; bsy=0 | |
bloci = rand(1:nloci, nloci) # sample loci with replacement | |
for locus in bloci | |
bsx += x[locus] | |
bsy += y[locus] | |
end | |
sum_abbababa = bsx + bsy # none of the loci have any of the ABBA or BABA sites | |
if sum_abbababa ≈ 0 # D would be NaN, then SD would be NaN | |
continue # try again: don't increment b (bootstrap rep) | |
end | |
bD[b] = (bsx - bsy)/sum_abbababa | |
b += 1 | |
end | |
return Statistics.std(bD) | |
end | |
""" | |
reformat_fasta_linear(infile, outfile; sequencetype=LongDNA{4}) | |
Reformat from interleaved (multiple lines and/or multiple records) to sequential | |
format and write to `outputfile`. Error if `outputfile` already exists. | |
Return nothing. | |
# example | |
```julia | |
julia> file = "examples/interleaved.fasta"; | |
julia> reformat_fasta_linear(file, "output.fasta") | |
shell> diff output.fasta examples/sequential.fasta | |
``` | |
""" | |
function reformat_fasta_linear(infile::AbstractString, outfile::AbstractString; | |
sequencetype=LongDNA{4}) | |
ispath(outfile) && error("I don't want to overwrite $outfile") | |
seqs = Array{BioSequences.BioSequence}(undef, 0) | |
taxa = String[] | |
reader = FASTA.Reader(open(infile, "r")) | |
for record in reader | |
seq1 = sequence(sequencetype, record) | |
push!(seqs, seq1) | |
push!(taxa, FASTA.identifier(record)) | |
end | |
close(reader) | |
taxaunique = unique(taxa) | |
writer = FASTA.Writer(open(outfile, "w"), width=-1) | |
for taxon in taxaunique | |
seq1 = sequencetype() # initialize with empty sequence | |
idx = findall(isequal(taxon), taxa) # identify all sequences for this taxon | |
for i in idx seq1 *= seqs[i]; end # concatenate sequences | |
record = FASTA.Record(taxon, seq1) | |
write(writer, record) | |
end | |
close(writer) | |
return nothing | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment