Last active
December 21, 2015 01:09
-
-
Save walterst/3144b1401fbda7bab464 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import division | |
__author__ = "William Walters" | |
__copyright__ = "Copyright 2011, The QIIME Project" | |
__credits__ = ["William Walters"] | |
__license__ = "GPL" | |
__version__ = "1.9.1-dev" | |
__maintainer__ = "William Walters" | |
__email__ = "William.A.Walters@colorado.edu" | |
from os.path import join | |
from biom import load_table | |
from qiime.util import parse_command_line_parameters, get_options_lookup,\ | |
make_option, create_dir, qiime_open | |
from qiime.parse import parse_newick, PhyloNode, parse_mapping_file_to_dict | |
options_lookup = get_options_lookup() | |
script_info={} | |
script_info['brief_description']="""Compares mapping file and OTU table | |
SampleIDs, and optionally, OTU table OTU IDs and tree tip IDs.""" | |
script_info['script_description']=""" This script is used to check for matching | |
SampleIDs between one's mapping file and OTU table. For most scripts, the | |
SampleIDs in the OTU table need to be equal to or a subset of the IDs in the | |
mapping file. Likewise, when a phylogenetic tree is used, the OTU IDs should | |
be an exact match or a subset of the tree tips. | |
This script will report the counts of IDs and tips that do not match, | |
and write text files containing IDs that do not match in separate text files | |
for easy parsing. | |
""" | |
script_info['output_description']="""A summary.txt file containing overview | |
of the results, and text files containing IDs that do not match are written | |
to the output directory.""" | |
script_info['required_options']= [\ | |
make_option('-m', '--mapping_fp',type='existing_filepath', | |
help='Metadata mapping filepath'), | |
make_option('-i', '--otu_table_fp',type='existing_filepath', | |
help='OTU table filepath') | |
] | |
script_info['optional_options']= [\ | |
make_option('-o', '--output_dir',type='new_dirpath', | |
help='Output directory [default: %default]', default="./"), | |
make_option('-t', '--tree_fp',type='existing_filepath', | |
help='phylogenetic tree filepath [default: %default]', default=None) | |
] | |
script_info['version'] = __version__ | |
def main(): | |
option_parser, opts, args =\ | |
parse_command_line_parameters(suppress_verbose=True, **script_info) | |
# Get SampleIDs from mapping file | |
mapping_sampleids =\ | |
set(parse_mapping_file_to_dict(opts.mapping_fp)[0].keys()) | |
# Get OTU table SampleIDs and OTU IDs | |
otu_table = load_table(opts.otu_table_fp) | |
otu_table_sampleids = set(otu_table.ids()) | |
otu_table_observationids = set(otu_table._observation_ids) | |
# If tree specified, get tip IDs | |
if opts.tree_fp: | |
tree = parse_newick(qiime_open(opts.tree_fp), PhyloNode) | |
tree_tips = set(tree.getTipNames()) | |
else: | |
tree = None | |
sample_ids_not_in_otu_table = mapping_sampleids - otu_table_sampleids | |
sample_ids_not_in_mapping = otu_table_sampleids - mapping_sampleids | |
if tree: | |
otu_ids_not_in_tree = otu_table_observationids - tree_tips | |
otu_ids_not_in_table = tree_tips - otu_table_observationids | |
create_dir(opts.output_dir) | |
summary_f = open(join(opts.output_dir, "summary.txt"), "w") | |
sample_ids_not_in_otu_table_f =\ | |
open(join(opts.output_dir, "SampleIDsnotinOTUTable.txt"), "w") | |
sample_ids_not_in_mapping_f =\ | |
open(join(opts.output_dir, "SampleIDsnotinMapping.txt"), "w") | |
if tree: | |
otu_ids_not_in_tree_f =\ | |
open(join(opts.output_dir, "OTUidsnotinTree.txt"), "w") | |
otu_ids_not_in_table_f =\ | |
open(join(opts.output_dir, "TreeTipidsnotinOTUTable.txt"), "w") | |
summary_f.write("Summary of mismatches SampleIDs and OTU IDs/tree tips\n") | |
summary_f.write("The OTU table *can* have a subset of the mapping file\n") | |
summary_f.write("SampleIDs and be a subset of the tree tips matching \n") | |
summary_f.write("OTU IDs and still function with QIIME scripts. The \n") | |
summary_f.write("converse situation isn't necessarily true though.\n\n") | |
summary_f.write("Count of SampleIDs in OTU table not found in mapping file: %d\n" % len(sample_ids_not_in_mapping)) | |
summary_f.write("Count of SampleIDs in mapping file not found in OTU table: %d\n" % len(sample_ids_not_in_otu_table)) | |
if tree: | |
summary_f.write("Count of OTU IDs in table not found in tree: %d\n" % len(otu_ids_not_in_table)) | |
summary_f.write("Count of tree tips not found in OTU table: %d\n" % len(otu_ids_not_in_tree)) | |
for n in sample_ids_not_in_otu_table: | |
sample_ids_not_in_otu_table_f.write("%s\n" % n) | |
for n in sample_ids_not_in_mapping: | |
sample_ids_not_in_mapping_f.write("%s\n" % n) | |
if tree: | |
for n in otu_ids_not_in_tree: | |
otu_ids_not_in_tree_f.write("%s\n" % n) | |
for n in otu_ids_not_in_table: | |
otu_ids_not_in_table_f.write("%s\n" % n) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment