Created
March 4, 2019 15:39
-
-
Save cpauvert/1ba6a97b01ea6cde4398a8d531fa62f9 to your computer and use it in GitHub Desktop.
Filtering out contaminants with control samples (Galan et al. 2016)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Filtering out contaminants with control samples (Galan et al. 2016) | |
# Methods: Galan et al. (2016) doi:10.1128/mSystems.00032-16 | |
# Implementation: Charlie Pauvert | |
# 2018-09-21 | |
# Visualize rapidly the filtered OTU (or ASV) table | |
viz_matrix<-function(m,axes=T, xlab=NULL, ylab=NULL,...){ | |
mviz<-t(m) > 0 | |
image( mviz[,rev(1:ncol(mviz))], xaxt= "n", yaxt= "n", col = c("white","black"),useRaster = T,...) | |
if(axes){ | |
axis( 1, at=seq(0,1,length.out=ncol( m ) ), labels= colnames( m ), las= 2 ) | |
axis( 2, at=seq(0,1,length.out=nrow( m ) ), labels= rev(rownames( m )), las= 2) | |
} | |
if(!is.null(xlab)) mtext(xlab,side = 1) | |
if(!is.null(ylab)) mtext(ylab,side = 2) | |
} | |
# Run the filters | |
run_galan<-function(physeq=NULL, ctrl_smp=NULL, positive=NULL, positive_smp=NULL,plot=TRUE){ | |
if(any(is.null(physeq),is.null(ctrl_smp))){ | |
stop("physeq or ctrl_cmp is NULL") | |
} else { | |
message("Threshold T_CC : filtering for cross-contamination") | |
# Find the maximum number of sequences for an ASV in the | |
# negative and positive controls samples | |
if(!is.null(positive) & !is.null(positive_smp)){ | |
# Check if positives samples exists and ASV are given | |
positive<-positive[positive %in% taxa_names(physeq)] | |
# Fetch only ASV table of the control samples | |
ctrl<-prune_samples(c(ctrl_smp,positive_smp),physeq) | |
# # And removed the positive control ASVs | |
# ctrl<-prune_taxa(!taxa_names(ctrl) %in% positive,ctrl) | |
} else { | |
# Fetch only negative control samples | |
ctrl<-prune_samples(ctrl_smp,physeq) | |
} | |
m.ctrl<-as(otu_table( ctrl ), "matrix") | |
# T_CC threshold for each ASV in the control samples | |
mar<-ifelse(taxa_are_rows(physeq), 1 , 2) | |
threshold.ctrl<-apply(m.ctrl,mar,max) | |
if(!is.null(positive)){ | |
# Positive control ASVs should not be filtered | |
threshold.ctrl[ names(threshold.ctrl) %in% positive]<-0 | |
} | |
# Define a function to relevel ASV to 0 according to a ASV wise threshold. | |
adaptative_threshold<-function(physeq, ASVThreshold){ | |
# Fetch ASV table | |
ASV<-otu_table(physeq) | |
# set margin depending on ASV orientation: row or column | |
taxMargin<-ifelse( taxa_are_rows(ASV), 1, 2) | |
# remove the number of sequences indicated in ASVThreshold | |
# by samples for each ASV | |
ASV.sweeped<-sweep(ASV, taxMargin, ASVThreshold+1, "-") | |
# any ASV abundance negative was hence below the threshold | |
# and therefore set to NA | |
ASV.sweeped[ which(ASV.sweeped < 0) ]<-NA | |
# Re balance the altered abundance by adding the previously | |
# removed threshold. Note that NA are not affected | |
ASV.sweeped<-sweep(ASV.sweeped, taxMargin, ASVThreshold+1,"+") | |
# ASV that are NA are set to 0 and considered absent. | |
ASV.sweeped[which(is.na(ASV.sweeped))]<-0 | |
# Return the corrected ASV table | |
otu_table(physeq)<-otu_table(ASV.sweeped, | |
taxa_are_rows = taxa_are_rows(ASV)) | |
return(physeq) | |
} | |
# Keep the initial ASV table | |
initM<-as(otu_table( physeq ), "matrix") | |
# Apply the threshold to the ASV table | |
physeq<-adaptative_threshold(physeq, threshold.ctrl) | |
# Compute the number of cells where the filter occurred | |
finalM<-as(otu_table( physeq ), "matrix") | |
# Wherever the value is negative, the filter occurred | |
finalM<-finalM - initM < 0 | |
if(plot){ | |
# Plot the results | |
(viz_matrix(finalM,axes=F,main="Threshold T_CC : filtering for cross-contamination")) | |
} | |
message(paste("Ranges of the T_CC treshold:", min(threshold.ctrl),max(threshold.ctrl))) | |
if(!is.null(positive) & !is.null(positive_smp)){ | |
message("\n\nThreshold T_FA : filtering out incorrectly assigned sequences.") | |
positive<-positive[positive %in% taxa_names(physeq)] | |
# Identification of the maximal number of sequences of "alien" ASV | |
# assigned to environmental samples | |
alien.max<-max(otu_table(physeq)[positive,! sample_names(physeq) %in% positive_smp]) | |
# Find which ASV had the maximum values (because the above drops names) | |
alien.max.otu<-rownames(which(otu_table(physeq)[positive,! sample_names(physeq) %in% positive_smp] == alien.max,arr.ind = T)) | |
message(paste("Max alien sequence number:",alien.max)) | |
message(paste(" identified with the following ASV:",alien.max.otu)) | |
# False-assignment rate R_fa | |
alien.ratio<-alien.max / sum(taxa_sums(physeq)[alien.max.otu]) | |
message(paste("Alien sequence ratio (R_fa):", scales::percent(alien.ratio))) | |
# Number of sequences potentially misassigned to their samples for each ASV ( | |
# Threshold T_fa | |
threshold.alien<-taxa_sums(physeq) * alien.ratio | |
message(paste("Ranges of the T_FA treshold:", min(threshold.alien),round(max(threshold.alien)))) | |
initM<-as(otu_table( physeq ), "matrix") | |
# Apply the T_fa threshold | |
physeq<-adaptative_threshold(physeq, threshold.alien) | |
# Compute the number of cells where the filter occurred | |
finalM<-as(otu_table( physeq ), "matrix") | |
# Wherever the value is negative, the filter occurred | |
finalM<-finalM - initM < 0 | |
if(plot){ | |
# Plot the results | |
(viz_matrix(finalM,axes=F,main="Threshold T_FA : filtering out incorrectly assigned sequences.")) | |
} | |
} | |
message("\n\nThe removal of the control samples and the control ASV is NOT done within this function!") | |
message("\n\nPlease cite Galan et al. (2016) doi:10.1128/mSystems.00032-16") | |
return(physeq) | |
} | |
} | |
# | |
# Example of use of the function | |
# | |
# ASV identifiers of the positive control ASV of the strain. | |
# marine.strains<-c(rep('Yamadazyma barbieri',3),rep('Candida oceani',2)) | |
# names(marine.strains)<-c("e3b29fe75256a310e2712678e84612826b584728", | |
# "8dd49297074e132374a68ef5219244b0fc512826", | |
# "3723dfe3dec2cdebeef00493920371525866ad55",# Yama | |
# "af9d85218cd86808f881938898ad184945244cb0", | |
# "e37aa309d50c33ef3c5b5521540f762e06cf0b10") | |
# Applying Galan filters using two thresholds T_CC and T_FA | |
# | |
# run_one_filtered<-run_galan(physeq = run_one, | |
# ctrl_smp = c("B.poole","T.PCR"), | |
# positive = names(marine.strains), | |
# positive_smp = c("TplusPCR", "Tplus.ePCR")) | |
# Example of output by the function | |
# | |
# Threshold T_CC : filtering for cross-contamination | |
# Ranges of the T_CC treshold: 0 158 | |
# | |
# | |
# Threshold T_FA : filtering out incorrectly assigned sequences. | |
# Max alien sequence number: 310 | |
# identified with the following ASV: e3b29fe75256a310e2712678e84612826b584728 | |
# Alien sequence ratio (R_fa): 0.371% | |
# Ranges of the T_FA treshold: 0 8946 | |
# | |
# | |
# The removal of the control samples and the control ASV is NOT done within this function! | |
# | |
# | |
# Please cite Galan et al. (2016) doi:10.1128/mSystems.00032-16 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment