Last active
May 12, 2016 13:59
-
-
Save meren/4d969b37df61d35bf0baad6baf8092ab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 | |
"""This is a simple script to convert ITEP output into a matrix. | |
Essentially it takes an ITEP flat clusters file, and generates | |
the `data.txt` file mentioned in this article: | |
http://merenlab.org/2015/11/14/pangenomics/ | |
""" | |
import sys | |
import argparse | |
import anvio.terminal as terminal | |
import anvio.filesnpaths as filesnpaths | |
from anvio.errors import ConfigError, FilesNPathsError | |
run = terminal.Run() | |
progress = terminal.Progress() | |
def main(itep_clusters_path, output_file_path): | |
filesnpaths.is_file_exists(itep_clusters_path) | |
filesnpaths.is_output_file_writable(output_file_path) | |
itep_clusters = {} | |
genomes_found = set([]) | |
progress.new("Reading ITEP clusters") | |
progress.update('...') | |
for line in (l.strip() for l in open(itep_clusters_path).readlines()): | |
try: | |
info, cluster_id = line.split('\t') | |
except: | |
progress.end() | |
raise ConfigError, "This input file does not seem to be an ITEP output\ | |
this script is expecting (i.e., there aren't three\ | |
TAB-delimted columsn in this file)." | |
genome_name = info.split('|')[3].split('_')[0] | |
genomes_found.add(genome_name) | |
if not itep_clusters.has_key(cluster_id): | |
itep_clusters[cluster_id] = {} | |
if not itep_clusters[cluster_id].has_key(genome_name): | |
itep_clusters[cluster_id][genome_name] = 0 | |
itep_clusters[cluster_id][genome_name] += 1 | |
progress.end() | |
run.info("Num genomes", len(genomes_found)) | |
run.info("Num protein clusters", len(itep_clusters)) | |
progress.new("Creating the output file") | |
progress.update('...') | |
genomes_found = sorted(list(genomes_found)) | |
output_file = open(output_file_path, 'w') | |
output_file.write('\t'.join(['contig'] + genomes_found) + '\n') | |
cluster_ids_sorted = [t[1] for t in sorted([(sum(itep_clusters[cid].values()), cid) for cid in itep_clusters], reverse=True)] | |
for cluster_id in cluster_ids_sorted: | |
values_dict = itep_clusters[cluster_id] | |
values = [str(values_dict[g]) if values_dict.has_key(g) else '0' for g in genomes_found] | |
output_file.write('\t'.join([cluster_id] + values) + '\n') | |
output_file.close() | |
progress.end() | |
run.info("Data file is generated", output_file_path) | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser(description='A simple script to convert ITEP output into data.txt') | |
parser.add_argument('itep_clusters', metavar = 'ITEP_CLUSTERS', | |
help = 'This is the ITEP output file you should find in\ | |
"flatclusters" directory') | |
parser.add_argument('-o', '--output-file', metavar = 'OUTPUT_FILE', | |
help = 'Where to store the information. The default is\ | |
"%(default)s".', default = 'data.txt') | |
args = parser.parse_args() | |
try: | |
main(args.itep_clusters, args.output_file) | |
except ConfigError, e: | |
print e | |
sys.exit(-1) | |
except FilesNPathsError, e: | |
print e | |
sys.exit(-2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment