Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Created December 3, 2012 20:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gregcaporaso/4197809 to your computer and use it in GitHub Desktop.
Save gregcaporaso/4197809 to your computer and use it in GitHub Desktop.
first pass at code for filtering OTUs that show up in negative control samples

Script for filtering OTUs that show up in negative control samples. This is a first pass at testing a process that was developed for the Student Microbiome Project. The effect of this filtering has not be investigated in detail, so use at your own risk.

This script works as follows:

  1. Filter input OTU table to contain only the control samples (as indicated by the -s parameter)
  2. Compute the median or mean (specified with --abundance_f) abundance of each OTU in the control samples. Generate a list of OTUs where this value is >= the minimum abundance (specified with --min_abundance).
  3. Filter the OTUs identified in Step 2 from the input OTU table.
#!/usr/bin/env python
# File created on 03 Dec 2012
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.5.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Development"
from os.path import split, splitext
from numpy import median, mean
from biom.parse import parse_biom_table
from qiime.util import parse_command_line_parameters, make_option, create_dir
from qiime.filter import sample_ids_from_metadata_description
from qiime.format import format_biom_table
abundances_fs = {'median':median,
'mean':mean}
script_info = {}
script_info['brief_description'] = "Script for filtering OTUs that show up in negative control samples"
script_info['script_description'] = """This script works as follows:
1. Filter input OTU table to contain only the control samples (as indicated by the -s parameter)
2. Compute the median or mean (specified with --abundance_f) abundance of each OTU in the control samples. Generate a list of OTUs where this value is >= the minimum abundance (specified with --min_abundance).
3. Filter the OTUs identified in Step 2 from the input OTU table.
"""
script_info['script_usage'] = [("","Filter any OTUs whose median abundance in the control samples is >= 0.05",'%prog -i otu_table.biom -m map.txt -o out/ -s "Control:Yes" -a 0.05')]
script_info['output_description']= ""
script_info['required_options'] = [
make_option('-i','--input_otu_table_fp',
type="existing_filepath",
help='the input OTU table filepath'),
make_option('-m','--mapping_fp',
type="existing_filepath",
help='the input mapping filepath'),
make_option('-s','--control_states',
help=('state string describing which samples are controls'
' (e.g., "Control:True", where "Control" is a column in '
'mapping_fp and "True" is a value in that column)')),
make_option('-a','--min_abundance',type=float,
help=("Minimum median or mean abundance "
"(specify which with --abundance_f )"
" of an OTU across the control samples"
" to consider it a contaminant and filter it.")),
make_option('-o','--output_dir',type="new_dirpath",
help="the output directory")
]
script_info['optional_options'] = [
make_option('--abundance_f',type='choice',
choices=abundances_fs.keys(),
default='median',
help=("the function to use for summarizing "
"the abundances of each OTU."))
]
script_info['version'] = __version__
def filter_otu_table_to_control_samples(table,mapping_f,control_states):
control_sample_ids = sample_ids_from_metadata_description(mapping_f,control_states)
def f(values, sid, md):
return sid in control_sample_ids
return table.filterSamples(f)
def identify_otus_to_filter(table,min_abundance,abundance_f=median):
otus_to_filter = []
t = table.normObservationBySample()
for v, oid, omd in t.iterObservations():
if abundance_f(v) >= min_abundance:
try:
otus_to_filter.append((oid,omd['taxonomy']))
except TypeError:
# no taxonomy included
otus_to_filter.append((oid,None))
return otus_to_filter
def filter_control_otus(table,otus_to_filter):
def f(values, oid, md):
return oid in otus_to_filter
return table.filterObservations(f,invert=True)
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
input_otu_table_fp = opts.input_otu_table_fp
input_otu_table = parse_biom_table(open(input_otu_table_fp))
input_otu_table_basename = splitext(split(input_otu_table_fp)[1])[0]
mapping_fp = opts.mapping_fp
control_states = opts.control_states
min_abundance = opts.min_abundance
abundance_f = abundances_fs[opts.abundance_f]
output_dir = opts.output_dir
create_dir(output_dir)
log_f = open('%s/filtered_otus.txt' % output_dir,'w')
filtered_biom_fp = '%s/%s.control_filtered_%s_%1.3f.biom' %\
(output_dir,input_otu_table_basename,opts.abundance_f,min_abundance)
filtered_biom_f = open(filtered_biom_fp,'w')
control_otu_table = filter_otu_table_to_control_samples(
input_otu_table,open(mapping_fp),control_states)
otus_to_filter = identify_otus_to_filter(
control_otu_table,min_abundance,abundance_f=abundance_f)
filtered_table = filter_control_otus(input_otu_table,
[o[0] for o in otus_to_filter])
filtered_biom_f.write(format_biom_table(filtered_table))
filtered_biom_f.close()
# log the OTU ids that are being filtered
log_f.write("#OTU ID\tTaxonomy (if any)\n")
for otu_id, taxa in otus_to_filter:
log_f.write("%s\t%s\n" % (str(otu_id),str(taxa)))
log_f.close()
if __name__ == "__main__":
main()
#SampleID Control Description
s1 No s1
s2 No s2
s3 No s3
c1 Yes c1
c2 Yes c2
{"rows": [{"id": "o1", "metadata": {"taxonomy": "a;b;c"}}, {"id": "o2", "metadata": {"taxonomy": "a;b;d"}}, {"id": "o3", "metadata": {"taxonomy": "a;b;e"}}, {"id": "o4", "metadata": {"taxonomy": "a;b;f"}}], "format": "Biological Observation Matrix 1.0.0", "data": [[0, 0, 66.0], [0, 1, 65.0], [0, 2, 2.0], [0, 3, 3.0], [0, 4, 6.0], [1, 0, 5.0], [1, 1, 6.0], [1, 3, 12.0], [1, 4, 3.0], [2, 0, 1.0], [2, 3, 85.0], [2, 4, 91.0], [3, 0, 28.0], [3, 1, 29.0], [3, 2, 98.0]], "columns": [{"id": "s1", "metadata": null}, {"id": "s2", "metadata": null}, {"id": "s3", "metadata": null}, {"id": "c1", "metadata": null}, {"id": "c2", "metadata": null}], "generated_by": "BIOM-Format 1.0.0c", "matrix_type": "sparse", "shape": [4, 5], "format_url": "http://biom-format.org", "date": "2012-12-03T12:42:38.166092", "type": "OTU table", "id": null, "matrix_element_type": "float"}
#!/usr/bin/env python
# File created on 03 Dec 2012
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.5.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Development"
from shutil import rmtree
from os.path import exists, join
from cogent.util.unit_test import TestCase, main
from cogent.util.misc import remove_files, create_dir
from biom.parse import parse_biom_table
from qiime.util import (get_qiime_temp_dir,
get_tmp_filename)
from qiime.test import initiate_timeout, disable_timeout
from filter_control_otus import (filter_otu_table_to_control_samples,
identify_otus_to_filter, filter_control_otus)
class IdentifyControlOtusTests(TestCase):
def setUp(self):
self.mapping_f = mapping_f.split("\n")
self.otu_table = parse_biom_table(otu_table.split("\n"))
self.otu_table_control = parse_biom_table(otu_table_control.split("\n"))
self.otu_table_data = parse_biom_table(otu_table_data.split("\n"))
self.otu_table_control_no_tax = parse_biom_table(otu_table_control_no_tax.split("\n"))
self.control_filtered_otu_table = parse_biom_table(control_filtered_otu_table.split("\n"))
def test_filter_otu_table_to_control_samples(self):
"""filter_otu_table_to_control_samples functions as expected"""
self.assertEqual(
filter_otu_table_to_control_samples(self.otu_table,self.mapping_f,"Control:Yes"),
self.otu_table_control)
self.assertEqual(
filter_otu_table_to_control_samples(self.otu_table,self.mapping_f,"Control:No"),
self.otu_table_data)
def test_identify_otus_to_filter(self):
"""identify_otus_to_filter functions as expected"""
self.assertEqual(
identify_otus_to_filter(self.otu_table_control,0.05),
[('o2','a;b;d'),('o3','a;b;e')])
self.assertEqual(
identify_otus_to_filter(self.otu_table_control,0.5),
[('o3','a;b;e')])
self.assertEqual(
identify_otus_to_filter(self.otu_table_control_no_tax,0.05),
[('o2',None),('o3',None)])
def test_filter_control_otus(self):
"""filter_control_otus functions as expected"""
self.assertEqual(filter_control_otus(self.otu_table,['o2','o3']),
self.control_filtered_otu_table)
mapping_f = """#SampleID Control Description
s1 No s1
s2 No s2
s3 No s3
c1 Yes c1
c2 Yes c2"""
otu_table = """{"rows": [{"id": "o1", "metadata": {"taxonomy": "a;b;c"}}, {"id": "o2", "metadata": {"taxonomy": "a;b;d"}}, {"id": "o3", "metadata": {"taxonomy": "a;b;e"}}, {"id": "o4", "metadata": {"taxonomy": "a;b;f"}}], "format": "Biological Observation Matrix 1.0.0", "data": [[0, 0, 66.0], [0, 1, 65.0], [0, 2, 2.0], [0, 3, 3.0], [0, 4, 6.0], [1, 0, 5.0], [1, 1, 6.0], [1, 3, 12.0], [1, 4, 3.0], [2, 0, 1.0], [2, 3, 85.0], [2, 4, 91.0], [3, 0, 28.0], [3, 1, 29.0], [3, 2, 98.0]], "columns": [{"id": "s1", "metadata": null}, {"id": "s2", "metadata": null}, {"id": "s3", "metadata": null}, {"id": "c1", "metadata": null}, {"id": "c2", "metadata": null}], "generated_by": "BIOM-Format 1.0.0c", "matrix_type": "sparse", "shape": [4, 5], "format_url": "http://biom-format.org", "date": "2012-12-03T12:42:38.166092", "type": "OTU table", "id": null, "matrix_element_type": "float"}"""
otu_table_control = """{"rows": [{"id": "o1", "metadata": {"taxonomy": "a;b;c"}}, {"id": "o2", "metadata": {"taxonomy": "a;b;d"}}, {"id": "o3", "metadata": {"taxonomy": "a;b;e"}}, {"id": "o4", "metadata": {"taxonomy": "a;b;f"}}], "format": "Biological Observation Matrix 1.0.0", "data": [[0, 0, 3.0], [0, 1, 6.0], [1, 0, 12.0], [1, 1, 3.0], [2, 0, 85.0], [2, 1, 91.0]], "columns": [{"id": "c1", "metadata": null}, {"id": "c2", "metadata": null}], "generated_by": "BIOM-Format 1.0.0c", "matrix_type": "sparse", "shape": [4, 2], "format_url": "http://biom-format.org", "date": "2012-12-03T12:56:49.581524", "type": "OTU table", "id": null, "matrix_element_type": "float"}"""
otu_table_control_no_tax = """{"rows": [{"id": "o1", "metadata":null}, {"id": "o2", "metadata": null}, {"id": "o3", "metadata": null}, {"id": "o4", "metadata": null}], "format": "Biological Observation Matrix 1.0.0", "data": [[0, 0, 3.0], [0, 1, 6.0], [1, 0, 12.0], [1, 1, 3.0], [2, 0, 85.0], [2, 1, 91.0]], "columns": [{"id": "c1", "metadata": null}, {"id": "c2", "metadata": null}], "generated_by": "BIOM-Format 1.0.0c", "matrix_type": "sparse", "shape": [4, 2], "format_url": "http://biom-format.org", "date": "2012-12-03T12:56:49.581524", "type": "OTU table", "id": null, "matrix_element_type": "float"}"""
otu_table_data = """{"rows": [{"id": "o1", "metadata": {"taxonomy": "a;b;c"}}, {"id": "o2", "metadata": {"taxonomy": "a;b;d"}}, {"id": "o3", "metadata": {"taxonomy": "a;b;e"}}, {"id": "o4", "metadata": {"taxonomy": "a;b;f"}}], "format": "Biological Observation Matrix 1.0.0", "data": [[0, 0, 66.0], [0, 1, 65.0], [0, 2, 2.0], [1, 0, 5.0], [1, 1, 6.0], [2, 0, 1.0], [3, 0, 28.0], [3, 1, 29.0], [3, 2, 98.0]], "columns": [{"id": "s1", "metadata": null}, {"id": "s2", "metadata": null}, {"id": "s3", "metadata": null}], "generated_by": "BIOM-Format 1.0.0c", "matrix_type": "sparse", "shape": [4, 3], "format_url": "http://biom-format.org", "date": "2012-12-03T12:49:46.439032", "type": "OTU table", "id": null, "matrix_element_type": "float"}"""
control_filtered_otu_table = """{"rows": [{"id": "o1", "metadata": {"taxonomy": "a;b;c"}}, {"id": "o4", "metadata": {"taxonomy": "a;b;f"}}], "format": "Biological Observation Matrix 1.0.0", "data": [[0, 0, 66.0], [0, 1, 65.0], [0, 2, 2.0], [0, 3, 3.0], [0, 4, 6.0], [1, 0, 28.0], [1, 1, 29.0], [1, 2, 98.0]], "columns": [{"id": "s1", "metadata": null}, {"id": "s2", "metadata": null}, {"id": "s3", "metadata": null}, {"id": "c1", "metadata": null}, {"id": "c2", "metadata": null}], "generated_by": "BIOM-Format 1.0.0c", "matrix_type": "sparse", "shape": [2, 5], "format_url": "http://biom-format.org", "date": "2012-12-03T13:26:19.896552", "type": "OTU table", "id": null, "matrix_element_type": "float"}"""
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment