Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active December 18, 2015 10:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gregcaporaso/5769652 to your computer and use it in GitHub Desktop.
Save gregcaporaso/5769652 to your computer and use it in GitHub Desktop.
Hack for converting TGEN SNP pipeline output into files that can be converted to BIOM format for analysis with QIIME.

This is all UNTESTED CODE!!

Description

This software allows users to convert TGEN SNP tables to legacy-formatted QIIME OTU tables, which can then be converted into BIOM tables with convert_biom.py (from the biom-format package). This was quickly hacked together, so is untested, but intended to be useful in figuring out if making these tables available in BIOM format will support useful analyses.

Install

Required

  • qcli
  • python 2.6 or greater

Optional

Additionally useful software will be the biom-format 1.1.2 package (so you can convert the resulting files into BIOM format) and QIIME 1.7.0 (to perform diversity analyses on the resulting BIOM table).

Usage example

# create legacy formatted table and observation and sample metadata files
python snp_to_legacy_otu_table.py -i YP_matrix.txt -o out/

# convert them to BIOM format
python convert_biom.py -i out/table.txt -o out/table.biom --biom_table_type "gene table" --observation_mapping_fp out/obs_map.txt -m out/sample_map.txt

# run QIIME's core_diversity_analyses.py in non-phylogenetic mode
python core_diversity_analyses.py -i table.biom -o cd_43 -e 43 -m map.txt --nonphylogenetic_diversity --suppress_taxa_summary --suppress_alpha_diversity --suppress_otu_category_significance
#!/usr/bin/env python
# File created on 12 Jun 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2013, The BiPy project"
__credits__ = ["Greg Caporaso", "Jason Sahl"]
__license__ = "GPL"
__version__ = "0.0.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Development"
from os.path import join
from qcli import parse_command_line_parameters, make_option
script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
script_info['script_usage'] = [("","","")]
script_info['output_description']= ""
script_info['required_options'] = [
make_option('-i','--input_fp',type="existing_filepath",help='the input filepath'),
make_option('-o','--output_dir',type="new_dirpath",help='the output directory'),
]
script_info['optional_options'] = []
script_info['version'] = __version__
def parse_snp_matrix(input_f):
"""
"""
sample_ids = []
locus_ids = []
references = []
matrix = []
for line in input_f:
line = line.strip()
if len(line) == 0 or line.startswith('#'):
continue
elif len(sample_ids) == 0:
# we're on the first line
fields = line.split('\t')
md_start = fields.index('#SNPcall')
sample_ids = fields[2:md_start]
else:
fields = line.split('\t')
locus_ids.append(fields[0])
reference = fields[1]
references.append(reference)
vector = []
for base in fields[2:md_start]:
if base == reference:
vector.append(0)
else:
vector.append(1)
matrix.append(vector)
return sample_ids, locus_ids, references, matrix
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
sample_ids, locus_ids, references, matrix = parse_snp_matrix(open(opts.input_fp,'U'))
# write mapping file
map_f = open(join(opts.output_dir,'sample_map.txt'),'w')
map_f.write('#SampleID\tDescription\n')
for sample_id in sample_ids:
map_f.write('%s\t%s\n' % (sample_id,sample_id))
map_f.close()
# write legacy-formatted OTU table
table_f = open(join(opts.output_dir,'table.txt'),'w')
table_f.write('#ObservationId\t%s\n' % '\t'.join(sample_ids))
for i, vector in enumerate(matrix):
table_f.write('%d\t%s\n' % (i, '\t'.join(map(str,vector))))
table_f.close()
# write observation mapping file
obs_map_f = open(join(opts.output_dir,'obs_map.txt'),'w')
obs_map_f.write('#ObservationID\tDescription\n')
for i, locus_id in enumerate(locus_ids):
obs_map_f.write('%d\t%s\n' % (i, locus_id))
obs_map_f.close()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment