Skip to content

Instantly share code, notes, and snippets.

@walterst
Last active December 21, 2015 01:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save walterst/3144b1401fbda7bab464 to your computer and use it in GitHub Desktop.
Save walterst/3144b1401fbda7bab464 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from __future__ import division
__author__ = "William Walters"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["William Walters"]
__license__ = "GPL"
__version__ = "1.9.1-dev"
__maintainer__ = "William Walters"
__email__ = "William.A.Walters@colorado.edu"
from os.path import join
from biom import load_table
from qiime.util import parse_command_line_parameters, get_options_lookup,\
make_option, create_dir, qiime_open
from qiime.parse import parse_newick, PhyloNode, parse_mapping_file_to_dict
options_lookup = get_options_lookup()
script_info={}
script_info['brief_description']="""Compares mapping file and OTU table
SampleIDs, and optionally, OTU table OTU IDs and tree tip IDs."""
script_info['script_description']=""" This script is used to check for matching
SampleIDs between one's mapping file and OTU table. For most scripts, the
SampleIDs in the OTU table need to be equal to or a subset of the IDs in the
mapping file. Likewise, when a phylogenetic tree is used, the OTU IDs should
be an exact match or a subset of the tree tips.
This script will report the counts of IDs and tips that do not match,
and write text files containing IDs that do not match in separate text files
for easy parsing.
"""
script_info['output_description']="""A summary.txt file containing overview
of the results, and text files containing IDs that do not match are written
to the output directory."""
script_info['required_options']= [\
make_option('-m', '--mapping_fp',type='existing_filepath',
help='Metadata mapping filepath'),
make_option('-i', '--otu_table_fp',type='existing_filepath',
help='OTU table filepath')
]
script_info['optional_options']= [\
make_option('-o', '--output_dir',type='new_dirpath',
help='Output directory [default: %default]', default="./"),
make_option('-t', '--tree_fp',type='existing_filepath',
help='phylogenetic tree filepath [default: %default]', default=None)
]
script_info['version'] = __version__
def main():
option_parser, opts, args =\
parse_command_line_parameters(suppress_verbose=True, **script_info)
# Get SampleIDs from mapping file
mapping_sampleids =\
set(parse_mapping_file_to_dict(opts.mapping_fp)[0].keys())
# Get OTU table SampleIDs and OTU IDs
otu_table = load_table(opts.otu_table_fp)
otu_table_sampleids = set(otu_table.ids())
otu_table_observationids = set(otu_table._observation_ids)
# If tree specified, get tip IDs
if opts.tree_fp:
tree = parse_newick(qiime_open(opts.tree_fp), PhyloNode)
tree_tips = set(tree.getTipNames())
else:
tree = None
sample_ids_not_in_otu_table = mapping_sampleids - otu_table_sampleids
sample_ids_not_in_mapping = otu_table_sampleids - mapping_sampleids
if tree:
otu_ids_not_in_tree = otu_table_observationids - tree_tips
otu_ids_not_in_table = tree_tips - otu_table_observationids
create_dir(opts.output_dir)
summary_f = open(join(opts.output_dir, "summary.txt"), "w")
sample_ids_not_in_otu_table_f =\
open(join(opts.output_dir, "SampleIDsnotinOTUTable.txt"), "w")
sample_ids_not_in_mapping_f =\
open(join(opts.output_dir, "SampleIDsnotinMapping.txt"), "w")
if tree:
otu_ids_not_in_tree_f =\
open(join(opts.output_dir, "OTUidsnotinTree.txt"), "w")
otu_ids_not_in_table_f =\
open(join(opts.output_dir, "TreeTipidsnotinOTUTable.txt"), "w")
summary_f.write("Summary of mismatches SampleIDs and OTU IDs/tree tips\n")
summary_f.write("The OTU table *can* have a subset of the mapping file\n")
summary_f.write("SampleIDs and be a subset of the tree tips matching \n")
summary_f.write("OTU IDs and still function with QIIME scripts. The \n")
summary_f.write("converse situation isn't necessarily true though.\n\n")
summary_f.write("Count of SampleIDs in OTU table not found in mapping file: %d\n" % len(sample_ids_not_in_mapping))
summary_f.write("Count of SampleIDs in mapping file not found in OTU table: %d\n" % len(sample_ids_not_in_otu_table))
if tree:
summary_f.write("Count of OTU IDs in table not found in tree: %d\n" % len(otu_ids_not_in_table))
summary_f.write("Count of tree tips not found in OTU table: %d\n" % len(otu_ids_not_in_tree))
for n in sample_ids_not_in_otu_table:
sample_ids_not_in_otu_table_f.write("%s\n" % n)
for n in sample_ids_not_in_mapping:
sample_ids_not_in_mapping_f.write("%s\n" % n)
if tree:
for n in otu_ids_not_in_tree:
otu_ids_not_in_tree_f.write("%s\n" % n)
for n in otu_ids_not_in_table:
otu_ids_not_in_table_f.write("%s\n" % n)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment