Skip to content

Instantly share code, notes, and snippets.

@cecileane
Last active May 17, 2022 20:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cecileane/0894d70161a2b490dbb330b96a29550a to your computer and use it in GitHub Desktop.
Save cecileane/0894d70161a2b490dbb330b96a29550a to your computer and use it in GitHub Desktop.
Julia script to calculate the D statistics with locus-bootstrapping for significance. used in Blair & Ané (2020, https://doi.org/10.1093/sysbio/syz056)
#=
inspired from evobiR::CalcD (https://github.com/coleoguy/evobir/blob/master/R/CalcD.R)
used in [Blair & Ané 2020](https://doi.org/10.1093/sysbio/syz056)
=#
using BioSymbols
using BioSequences
using FASTX
using PhyloNetworks
using Random
using Statistics
using Distributions
using DataFrames
using CSV
"""
calcPatternCounts(filename, taxa, nsites_perlocus=nothing; ambig=:D)
Take an alignment in a file in fasta format, a vector `taxa` of 4 taxon names,
the number of sites at each locus (optionally), and calculate
4 vectors for the following number of sites:
total, ABBA sites, BABA sites, and BBAA sites.
These numbers of sites are calculated by locus, so are stored in a vector
for each type of site. If `nsites_perlocus` is not provided, it is assumed
that the alignment has a single locus and each vector has a single value.
Note that the BBAA sites are *not* used by the D statistic, but are used by
some other statistics. So why not get them here.
By default, sites with any ambiguity are dropped (`ambig=:D`) before calculating
these 3 numbers. Alternatively, they could be resolved (`ambig=:R`) randomly.
Otherwise, ambiguous sites are just left alone (e.g. ignored: `ambig=:I`).
In that case for instance, `N--N` would count as an ABBA pattern.
**Warning**: this is assuming that the fasta file is in *sequential* format,
*not interleaved*. More specifically, each taxon name should appear only once.
For this one time, the corresponding sequence could be broken across several lines though.
Use `reformat_fasta_linear` to reformat an interleaved fasta file.
# example
```julia
shell> cat examples/tinyexample.fasta
>P1
ATTGGAA
>P5
ATCGGAA
>P2
AYCGGGG
>P3
ATTGAGG
>P4
ATTGA-G
julia> file = "examples/tinyexample.fasta";
julia> calcPatternCounts(file, ["P2","P1","P3","typo"])
(missing, missing, missing, missing)
julia> calcPatternCounts(file, ["P2","P1","P3","P5"])
([6], [1], [2], [0])
julia> calcPatternCounts(file, ["P2","P1","P3","P5"], [4,3]) # 2 loci: with 4 sites and 3 sites
([3, 3], [1, 0], [0, 2], [0, 0])
julia> calcPatternCounts(file, ["P2","P1","P3","P5"], repeat([4],2)) # 2 loci each with 4 sites
ERROR: the sum of locus lengths is not the total alignment length
julia> calcPatternCounts(file, ["P1","P2","P3","P5"]) # switching taxa 1 & 2 changes abba into baba
([6], [2], [1], [0])
julia> calcPatternCounts(file, ["P1","P2","P5","P3"]) # switching taxa 3 & 4 changes abba into baba
([6], [1], [2], [0])
julia> calcPatternCounts(file, ["P5","P3","P1","P2"]) # switching 1,2 with 3,4 has no effect
([6], [1], [2], [0])
julia> calcPatternCounts(file, ["P1","P2","P3","P4"]) # one more site excluded, gap for P4
([5], [0], [0], [1])
julia> calcPatternCounts(file, ["P1","P3","P2","P4"]) # switching 2 & 3 switches bbaa into baba
([5], [0], [1], [0])
julia> calcPatternCounts(file, ["P1","P2","P3","P4"]; ambig=:R) # ambiguous Y resolved
([6], [0], [0], [1])
```
"""
function calcPatternCounts(file::String, taxa::AbstractVector,
nsites_perlocus = nothing; ambig=:D)
length(Set(taxa)) == 4 || error("we need 4 distinct taxa for ABBA and BABA patterns")
# read alignment and make it a matrix
species, alg = PhyloNetworks.readFastaToArray(file)
## bug: due to [FASTA.Reader](https://github.com/BioJulia/BioSequences.jl/issues/64)
## where ambiguous sites can cause a sequences to be interpreted as AA instead of DNA.
## fixed in PhyloNetworks commit 651131a5, and in v0.14.4 whenever that version is released.
for i in 1:length(alg)
if !(eltype(alg[i]) <: DNA)
alg[i] = LongDNA{4}(string(alg[i])) # 4 bits needed for ambiguous sites
end
end
# alg = PhyloNetworks.readfastatodna(file) with next registered version
# perhaps do better using BioSequences or BioAlignments directly:
# r = FASTA.Reader(open("trial0_concat.fasta"))
# check that the 4 taxa are present, and all have same alignment length
# sometimes, 1 or more taxa might be missing a gene of interest
ind = indexin(taxa, species) # species[ind] in correct order to look at ABBA and BABA
# if any of the 4 taxa is missing: return missing values
any(isnothing, ind) && return (missing,missing,missing,missing)
# alignment for the (sub)set of 4 taxa in the appropriate order
alg_4 = view(alg, ind)
# check that all 4 sequences have the same # of sites, otherwise return missing values
nsites = length(alg_4[1])
if !isnothing(nsites_perlocus)
nsites == sum(nsites_perlocus) ||
error("the sum of locus lengths is not the total alignment length")
nloci = length(nsites_perlocus)
else
nloci = 1
nsites_perlocus = [nsites]
end
all(x -> length(x)==nsites, alg_4) || return (missing,missing,missing,missing)
# check that total length = sum of length of all loci, if provided
## deal with ambiguity in sequence data
# R: A or G
# Y: C or T
# S: G or C
# W: A or T
# K: G or T
# M: A or C
if ambig in [:D, :R]
# D: drop all ambiguous sites
# R: resolve, but first drop any site having a 3- or 4-ambiguity
target = (BioSymbols.ACGT..., DNA_R, DNA_Y, DNA_S, DNA_W, DNA_K, DNA_M)
j = nsites # start from the last site, to keep indices valid after deletion
# for j in nsites:-1:1
for locus in nloci:-1:1
for _ in nsites_perlocus[locus]:-1:1
keepj = (ambig==:D ?
all(s -> BioSymbols.iscertain(s[j]), alg_4) :
all(s -> s[j] in target, alg_4) )
if !keepj
for s in alg_4 # delete the site
deleteat!(s,j)
end
nsites_perlocus[locus] -= 1 # decrement # sites in this locus
end
j -= 1
end
end
end
nsites = length(alg_4[1]) # possibly less than before
sum(nsites_perlocus) == nsites ||
error("bug: after removing uncertain sites, the loci and total length don't match...")
## resolve remaining ambigous sites randomly
if ambig == :R
function resolver(x)
if iscertain(x) return x; end
if x==DNA_R return rand([DNA_A, DNA_G]); end
if x==DNA_Y return rand([DNA_C, DNA_T]); end
if x==DNA_S return rand([DNA_G, DNA_C]); end
if x==DNA_W return rand([DNA_A, DNA_T]); end
if x==DNA_K return rand([DNA_G, DNA_T]); end
if x==DNA_M return rand([DNA_A, DNA_C]);
else error("weird base: $x"); end
end
for a in alg_4
for j in 1:nsites
isambiguous(a[j]) || continue
a[j] = resolver(a[j])
end
end
end
# now calculate the number of sites with pattern ABBA or BABA
abba_all = Int[] # all loci
baba_all = Int[]
bbaa_all = Int[]
pattern = Vector{DNA}(undef, 4) # to avoid garbage collection
j = 0 # index in the concatenated alignment
for locus in 1:nloci
abba = 0
baba = 0
bbaa = 0
for _ in 1:nsites_perlocus[locus]
j += 1
for i in 1:4 pattern[i] = alg_4[i][j]; end
length(unique(pattern)) == 2 || continue # focus on biallelic sites
if pattern[1] == pattern[2]
if pattern[3] == pattern[4] # else bbba or bbab: do not count
bbaa += 1
end
continue
end
# at this point: pattern[1] != pattern[2] so we focus on xy.. patterns
pattern[3] != pattern[4] || continue # focus on ..ba patterns
if pattern[3] == pattern[1]
baba += 1
else # pattern[3] == pattern[2]
abba += 1
end
end
push!(abba_all, abba)
push!(baba_all, baba)
push!(bbaa_all, bbaa)
end
return nsites_perlocus,abba_all,baba_all,bbaa_all
end
"""
calcD(x, y)
Take a vector `x` of ABBA counts and a vector `y` of BABA counts, assuming that
in each list: 1 count corresponds to 1 gene. Then calculate the D statistics for
the concatenated data, that is: [sum(ABBA) - sum(BABA)] / [sum(ABBA) + sum(BABA)]
# example
```julia
julia> calcD(20, 5)
0.6
julia> calcD([5,4,6,5], [3,0,0,2])
0.6
```
"""
function calcD(x, y)
sx = sum(x); sy = sum(y)
return (sx - sy)/(sx + sy)
end
"""
calcDztest_concatenated(x,y)
calcDsd_concatenated(x, y, nbootstrap=5000)
Take a number of ABBA sites `x` and number of BABA sites `y` (possibly
calculated by `calcPatternCounts` from a concatenated alignment)
and test whether the data are consistent with mean D=0.
Output: D statistic, its standard deviation, and test p-value (2-tailed).
Assumptions:
1. independent sites in the single (concatenated) alignment
from which `x,y` come from.
2. the number of "informative" sites for the ABBA-BABA test, that is, the
number of ABBA + BABA sites is considered fixed. In other words, the test
conditions on the number of informative sites.
Note that D = (x-y)/(x+y) becomes phat - (1-phat) = 2*(phat -0.5)
where phat = estimated proportion of ABBA sites among informative sites,
that is, among ABBA & BABA sites.
Testing "mean D=0" is equivalent to testing p=0.5, which amounts to doing
is simple binomial test, or chi-square goodness-of-fit test if all sites
are assumed independent.
The first version does a z test (equivalent to chi-square goodness-of-fit test).
The second version calculates the standard deviation of the D statistics
using bootstrap under the null hypothesis that p_abba = p_baba, re-sampling the
ABBA-BABA sites only (*not* resampling all sites: see assumption 2 above).
Then calculates z = D/sd and pvalue = two-tailed probability beyond z under N(0,1).
This second version should be modified to allow for linkage between sites
(in which case the z-test becomes invalid).
For now, this version is equivalent to the z-test but slower for large counts,
and more reliable for small count provided that `nbootstrap` is large.
# examples
```julia
julia> calcDztest_concatenated(20,5) # D=0.6, z=3, p-value=0.0027
(0.6, 3.0000000000000004, 0.002699796063260186)
julia> calcDztest_concatenated(5,20) # D=-0.6, same p-value as above
(-0.6, -3.0, 0.002699796063260186)
julia> calcDsd_concatenated(20, 5, 10000) # same D, similar z & p.
(0.6, 3.0081772816666144, 0.0026281977308263323)
```
"""
function calcDztest_concatenated(x,y)
n = x+y
D = (x-y)/n
phat = x/n
z = (phat-0.5)*sqrt(n)*2 # = (phat - p0) / sqrt(p0 * (1-p0) * 1/n) when p0=0.5
pvalue = ccdf(Normal(), abs(z))*2
return D, z, pvalue
end
function calcDsd_concatenated(x, y, nbootstrap=5000)
n = x+y
D = (x-y)/n
d = Distributions.Binomial(n, 0.5) # null hypothesis for D=0: x=y=n/2
bx = rand(d, nbootstrap)
bD = (2*bx .- n) ./ n # y=n-x so x-y = x-(n-x) = 2x-n
z = D / Statistics.std(bD)
pvalue = ccdf(Normal(), abs(z))*2
return D, z, pvalue
end
"""
calcDsd_byloci(x, y, nbootstrap=5000)
Take a list of ABBA counts and a list of BABA counts, like `calcD`.
Calculate the standard deviation of the D statistics using bootstrap,
re-sampling loci. The sites are *not* resampled: only the loci are.
This is assuming independent loci, but possibly linked sites within loci.
There is one case of concern: if all resampled loci have no informative
(ABBA or BABA) sites. In this case, the D statistic is undefined: 0-0/(0+0)
If that happens, the corresponding bootstrap replicate is dropped.
`nbootstrap` is the total # of valid bootstrap replicates (not counting
dropped reps).
Warning: the bootstrap procedure requires a large sample size to be valid.
Here, this means a large number of loci. In the example below, we only have
4 loci, so the results from this bootstrap cannot be trusted. But 4 loci
makes the example run fast.
# example
```julia
julia> abba_4loci = [5,4,6,5]; baba_4loci = [3,0,0,2];
julia> D_value = calcD(abba_4loci, baba_4loci)
0.6
julia> D_sd = calcDsd_byloci(abba_4loci, baba_4loci, 10_000)
0.17212358417715387
julia> z = D_value/D_sd
3.485867453134517
julia> pval = ccdf(Normal(), abs(z))*2
0.0004905439979869396
julia> abba_4loci = [11,0,0,9]; baba_4loci = [3,0,0,2];
julia> D_value = calcD(abba_4loci, baba_4loci)
0.6
julia> D_sd = calcDsd_byloci(abba_4loci, baba_4loci, 10_000)
0.02462521664116594
julia> calcDsd_byloci([1,2], [1,2,3], 10)
ERROR: different loci for ABBA and BABA counts
julia> calcDsd_byloci([0,-1,0], [0,0,0], 10)
ERROR: some ABBA counts are negative
julia> calcDsd_byloci([0,0,0], [0,0,0], 10)
ERROR: all ABBA and BABA are 0
```
"""
function calcDsd_byloci(x, y, nbootstrap=5000)
nloci = length(x)
length(y) == nloci || error("different loci for ABBA and BABA counts")
bD = Vector{Float64}(undef, nbootstrap)
b = 1
all(x .>= 0) || error("some ABBA counts are negative")
all(y .>= 0) || error("some BABA counts are negative")
any(x .> 0) || any(y .> 0) || error("all ABBA and BABA are 0")
while b <= nbootstrap
bsx=0; bsy=0
bloci = rand(1:nloci, nloci) # sample loci with replacement
for locus in bloci
bsx += x[locus]
bsy += y[locus]
end
sum_abbababa = bsx + bsy # none of the loci have any of the ABBA or BABA sites
if sum_abbababa ≈ 0 # D would be NaN, then SD would be NaN
continue # try again: don't increment b (bootstrap rep)
end
bD[b] = (bsx - bsy)/sum_abbababa
b += 1
end
return Statistics.std(bD)
end
"""
reformat_fasta_linear(infile, outfile; sequencetype=LongDNA{4})
Reformat from interleaved (multiple lines and/or multiple records) to sequential
format and write to `outputfile`. Error if `outputfile` already exists.
Return nothing.
# example
```julia
julia> file = "examples/interleaved.fasta";
julia> reformat_fasta_linear(file, "output.fasta")
shell> diff output.fasta examples/sequential.fasta
```
"""
function reformat_fasta_linear(infile::AbstractString, outfile::AbstractString;
sequencetype=LongDNA{4})
ispath(outfile) && error("I don't want to overwrite $outfile")
seqs = Array{BioSequences.BioSequence}(undef, 0)
taxa = String[]
reader = FASTA.Reader(open(infile, "r"))
for record in reader
seq1 = sequence(sequencetype, record)
push!(seqs, seq1)
push!(taxa, FASTA.identifier(record))
end
close(reader)
taxaunique = unique(taxa)
writer = FASTA.Writer(open(outfile, "w"), width=-1)
for taxon in taxaunique
seq1 = sequencetype() # initialize with empty sequence
idx = findall(isequal(taxon), taxa) # identify all sequences for this taxon
for i in idx seq1 *= seqs[i]; end # concatenate sequences
record = FASTA.Record(taxon, seq1)
write(writer, record)
end
close(writer)
return nothing
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment