Created
August 18, 2011 22:30
-
-
Save jstvz/1155437 to your computer and use it in GitHub Desktop.
KAAS annotation engine htext parser.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
""" | |
Script for parsing the output of the KAAS annotation engine. | |
""" | |
import sys | |
import csv | |
import os | |
from optparse import OptionParser | |
usage = "usage: %prog [options] arg" | |
parser = OptionParser(usage) | |
parser.add_option("-i", "--input", dest="input",help="The q00001.keg htext file from the hier.tar.gz output from KAAS") | |
parser.add_option("-o", "--output", dest="output",help="Prefix for output files: i.e. prefix_raw_KEGG.txt, prefix_superpathways_KEGG.txt, etc.") | |
parser.add_option("-g", "--gene-names", action="store_true", dest="gene_names", default=False, help="Include predicted gene names in output") | |
(options, args) = parser.parse_args() | |
if options.input is None: | |
print "Input is missing!" | |
parser.print_help() | |
sys.exit(-1) | |
pathway_ko_lists = {} | |
raw_output_filename = options.output + '_raw_KEGG.txt' | |
superpathway_output_filename = options.output + '_superpathways_KEGG.txt' | |
csv_output_filename = options.output + '_KEGG.csv' | |
#raw_output = csv.writer(open(raw_output_handle, 'wb'), dialect = 'excel') | |
#superpathway_output = csv.writer(open(superpathway_output_handle, 'wb'), dialect = 'excel') | |
raw_output = open(raw_output_filename, 'wb') | |
superpathway_output = open(superpathway_output_filename, 'wb') | |
csv_output_handle = open(csv_output_filename, 'wb') | |
csv_output = csv.writer(csv_output_handle, dialect = 'excel') | |
if options.gene_names: | |
ko_output_filename = options.output + '_KEGG.ko' | |
ko_output_handle = open(ko_output_filename, 'wb') | |
ko_output = csv.writer(ko_output_handle, dialect = 'excel') | |
super_pathway_count = 0 | |
for inline in open(options.input): | |
if inline.startswith('A'): # Top level Category, e.g. "Metabolism" | |
category = inline.split('>')[1].strip('</b') | |
elif inline.startswith('B'): # SuperPathways, e.g."Carbohydrate Metabolism" | |
if super_pathway_count != 0: | |
superpathway_output.write('%s\t%i\n' % (super_pathway, super_pathway_count)) | |
try: | |
super_pathway = inline.split('>')[1].strip('</b') | |
super_pathway_count = 0 | |
except: | |
pass | |
else: | |
try: | |
super_pathway = inline.split('>')[1].strip('</b') | |
super_pathway_count = 0 | |
except: | |
pass | |
elif inline.startswith('C'): # KO Reference Pathway, e.g. "Glycolysis / Gluconeogenesis" | |
inline = inline.split('[') | |
pathway_list = inline[0].split()[2:] | |
pathway_name = ' '.join(pathway_list) | |
pathway_ko_number = 'K' + inline[0].split()[1] | |
pathway_ko_list = [] | |
raw_output.write('\n%s\t\t' % (pathway_name)) | |
elif inline.startswith('D'): # Query KO annotation | |
inline = inline.split(';') | |
contig = inline[0].strip('D ') | |
gene_ko_number = inline[1].split('>')[1].strip('</a') | |
pathway_ko_list.append(gene_ko_number) | |
super_pathway_count += 1 | |
raw_output.write('%s\t' % (gene_ko_number)) | |
csv_output.writerow([contig, gene_ko_number, pathway_name, pathway_ko_number, super_pathway]) | |
if options.gene_names: | |
gene_description = inline[2].strip() | |
gene_name = inline[1].split('>')[2].strip() | |
ko_output.writerow([gene_ko_number, gene_name, gene_description]) | |
if options.gene_names: | |
ko_output.close() | |
raw_output.close() | |
superpathway_output.close() | |
os.system("sort %s | uniq > /tmp/tmp.txt && cat /tmp/tmp.txt > %s" % (superpathway_output_filename, superpathway_output_filename)) | |
os.system("sort %s | uniq > /tmp/tmp.txt && cat /tmp/tmp.txt > %s" % (raw_output_filename, raw_output_filename)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment