Skip to content

Instantly share code, notes, and snippets.

@cpauvert
Created March 4, 2019 15:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cpauvert/1ba6a97b01ea6cde4398a8d531fa62f9 to your computer and use it in GitHub Desktop.
Save cpauvert/1ba6a97b01ea6cde4398a8d531fa62f9 to your computer and use it in GitHub Desktop.
Filtering out contaminants with control samples (Galan et al. 2016)
# Filtering out contaminants with control samples (Galan et al. 2016)
# Methods: Galan et al. (2016) doi:10.1128/mSystems.00032-16
# Implementation: Charlie Pauvert
# 2018-09-21
# Visualize rapidly the filtered OTU (or ASV) table
viz_matrix<-function(m,axes=T, xlab=NULL, ylab=NULL,...){
mviz<-t(m) > 0
image( mviz[,rev(1:ncol(mviz))], xaxt= "n", yaxt= "n", col = c("white","black"),useRaster = T,...)
if(axes){
axis( 1, at=seq(0,1,length.out=ncol( m ) ), labels= colnames( m ), las= 2 )
axis( 2, at=seq(0,1,length.out=nrow( m ) ), labels= rev(rownames( m )), las= 2)
}
if(!is.null(xlab)) mtext(xlab,side = 1)
if(!is.null(ylab)) mtext(ylab,side = 2)
}
# Run the filters
run_galan<-function(physeq=NULL, ctrl_smp=NULL, positive=NULL, positive_smp=NULL,plot=TRUE){
if(any(is.null(physeq),is.null(ctrl_smp))){
stop("physeq or ctrl_cmp is NULL")
} else {
message("Threshold T_CC : filtering for cross-contamination")
# Find the maximum number of sequences for an ASV in the
# negative and positive controls samples
if(!is.null(positive) & !is.null(positive_smp)){
# Check if positives samples exists and ASV are given
positive<-positive[positive %in% taxa_names(physeq)]
# Fetch only ASV table of the control samples
ctrl<-prune_samples(c(ctrl_smp,positive_smp),physeq)
# # And removed the positive control ASVs
# ctrl<-prune_taxa(!taxa_names(ctrl) %in% positive,ctrl)
} else {
# Fetch only negative control samples
ctrl<-prune_samples(ctrl_smp,physeq)
}
m.ctrl<-as(otu_table( ctrl ), "matrix")
# T_CC threshold for each ASV in the control samples
mar<-ifelse(taxa_are_rows(physeq), 1 , 2)
threshold.ctrl<-apply(m.ctrl,mar,max)
if(!is.null(positive)){
# Positive control ASVs should not be filtered
threshold.ctrl[ names(threshold.ctrl) %in% positive]<-0
}
# Define a function to relevel ASV to 0 according to a ASV wise threshold.
adaptative_threshold<-function(physeq, ASVThreshold){
# Fetch ASV table
ASV<-otu_table(physeq)
# set margin depending on ASV orientation: row or column
taxMargin<-ifelse( taxa_are_rows(ASV), 1, 2)
# remove the number of sequences indicated in ASVThreshold
# by samples for each ASV
ASV.sweeped<-sweep(ASV, taxMargin, ASVThreshold+1, "-")
# any ASV abundance negative was hence below the threshold
# and therefore set to NA
ASV.sweeped[ which(ASV.sweeped < 0) ]<-NA
# Re balance the altered abundance by adding the previously
# removed threshold. Note that NA are not affected
ASV.sweeped<-sweep(ASV.sweeped, taxMargin, ASVThreshold+1,"+")
# ASV that are NA are set to 0 and considered absent.
ASV.sweeped[which(is.na(ASV.sweeped))]<-0
# Return the corrected ASV table
otu_table(physeq)<-otu_table(ASV.sweeped,
taxa_are_rows = taxa_are_rows(ASV))
return(physeq)
}
# Keep the initial ASV table
initM<-as(otu_table( physeq ), "matrix")
# Apply the threshold to the ASV table
physeq<-adaptative_threshold(physeq, threshold.ctrl)
# Compute the number of cells where the filter occurred
finalM<-as(otu_table( physeq ), "matrix")
# Wherever the value is negative, the filter occurred
finalM<-finalM - initM < 0
if(plot){
# Plot the results
(viz_matrix(finalM,axes=F,main="Threshold T_CC : filtering for cross-contamination"))
}
message(paste("Ranges of the T_CC treshold:", min(threshold.ctrl),max(threshold.ctrl)))
if(!is.null(positive) & !is.null(positive_smp)){
message("\n\nThreshold T_FA : filtering out incorrectly assigned sequences.")
positive<-positive[positive %in% taxa_names(physeq)]
# Identification of the maximal number of sequences of "alien" ASV
# assigned to environmental samples
alien.max<-max(otu_table(physeq)[positive,! sample_names(physeq) %in% positive_smp])
# Find which ASV had the maximum values (because the above drops names)
alien.max.otu<-rownames(which(otu_table(physeq)[positive,! sample_names(physeq) %in% positive_smp] == alien.max,arr.ind = T))
message(paste("Max alien sequence number:",alien.max))
message(paste(" identified with the following ASV:",alien.max.otu))
# False-assignment rate R_fa
alien.ratio<-alien.max / sum(taxa_sums(physeq)[alien.max.otu])
message(paste("Alien sequence ratio (R_fa):", scales::percent(alien.ratio)))
# Number of sequences potentially misassigned to their samples for each ASV (
# Threshold T_fa
threshold.alien<-taxa_sums(physeq) * alien.ratio
message(paste("Ranges of the T_FA treshold:", min(threshold.alien),round(max(threshold.alien))))
initM<-as(otu_table( physeq ), "matrix")
# Apply the T_fa threshold
physeq<-adaptative_threshold(physeq, threshold.alien)
# Compute the number of cells where the filter occurred
finalM<-as(otu_table( physeq ), "matrix")
# Wherever the value is negative, the filter occurred
finalM<-finalM - initM < 0
if(plot){
# Plot the results
(viz_matrix(finalM,axes=F,main="Threshold T_FA : filtering out incorrectly assigned sequences."))
}
}
message("\n\nThe removal of the control samples and the control ASV is NOT done within this function!")
message("\n\nPlease cite Galan et al. (2016) doi:10.1128/mSystems.00032-16")
return(physeq)
}
}
#
# Example of use of the function
#
# ASV identifiers of the positive control ASV of the strain.
# marine.strains<-c(rep('Yamadazyma barbieri',3),rep('Candida oceani',2))
# names(marine.strains)<-c("e3b29fe75256a310e2712678e84612826b584728",
# "8dd49297074e132374a68ef5219244b0fc512826",
# "3723dfe3dec2cdebeef00493920371525866ad55",# Yama
# "af9d85218cd86808f881938898ad184945244cb0",
# "e37aa309d50c33ef3c5b5521540f762e06cf0b10")
# Applying Galan filters using two thresholds T_CC and T_FA
#
# run_one_filtered<-run_galan(physeq = run_one,
# ctrl_smp = c("B.poole","T.PCR"),
# positive = names(marine.strains),
# positive_smp = c("TplusPCR", "Tplus.ePCR"))
# Example of output by the function
#
# Threshold T_CC : filtering for cross-contamination
# Ranges of the T_CC treshold: 0 158
#
#
# Threshold T_FA : filtering out incorrectly assigned sequences.
# Max alien sequence number: 310
# identified with the following ASV: e3b29fe75256a310e2712678e84612826b584728
# Alien sequence ratio (R_fa): 0.371%
# Ranges of the T_FA treshold: 0 8946
#
#
# The removal of the control samples and the control ASV is NOT done within this function!
#
#
# Please cite Galan et al. (2016) doi:10.1128/mSystems.00032-16
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment