Skip to content

Instantly share code, notes, and snippets.

@jstvz
Created August 18, 2011 22:30
Show Gist options
  • Save jstvz/1155437 to your computer and use it in GitHub Desktop.
Save jstvz/1155437 to your computer and use it in GitHub Desktop.
KAAS annotation engine htext parser.
#!/usr/bin/python
"""
Script for parsing the output of the KAAS annotation engine.
"""
import sys
import csv
import os
from optparse import OptionParser
usage = "usage: %prog [options] arg"
parser = OptionParser(usage)
parser.add_option("-i", "--input", dest="input",help="The q00001.keg htext file from the hier.tar.gz output from KAAS")
parser.add_option("-o", "--output", dest="output",help="Prefix for output files: i.e. prefix_raw_KEGG.txt, prefix_superpathways_KEGG.txt, etc.")
parser.add_option("-g", "--gene-names", action="store_true", dest="gene_names", default=False, help="Include predicted gene names in output")
(options, args) = parser.parse_args()
if options.input is None:
print "Input is missing!"
parser.print_help()
sys.exit(-1)
pathway_ko_lists = {}
raw_output_filename = options.output + '_raw_KEGG.txt'
superpathway_output_filename = options.output + '_superpathways_KEGG.txt'
csv_output_filename = options.output + '_KEGG.csv'
#raw_output = csv.writer(open(raw_output_handle, 'wb'), dialect = 'excel')
#superpathway_output = csv.writer(open(superpathway_output_handle, 'wb'), dialect = 'excel')
raw_output = open(raw_output_filename, 'wb')
superpathway_output = open(superpathway_output_filename, 'wb')
csv_output_handle = open(csv_output_filename, 'wb')
csv_output = csv.writer(csv_output_handle, dialect = 'excel')
if options.gene_names:
ko_output_filename = options.output + '_KEGG.ko'
ko_output_handle = open(ko_output_filename, 'wb')
ko_output = csv.writer(ko_output_handle, dialect = 'excel')
super_pathway_count = 0
for inline in open(options.input):
if inline.startswith('A'): # Top level Category, e.g. "Metabolism"
category = inline.split('>')[1].strip('</b')
elif inline.startswith('B'): # SuperPathways, e.g."Carbohydrate Metabolism"
if super_pathway_count != 0:
superpathway_output.write('%s\t%i\n' % (super_pathway, super_pathway_count))
try:
super_pathway = inline.split('>')[1].strip('</b')
super_pathway_count = 0
except:
pass
else:
try:
super_pathway = inline.split('>')[1].strip('</b')
super_pathway_count = 0
except:
pass
elif inline.startswith('C'): # KO Reference Pathway, e.g. "Glycolysis / Gluconeogenesis"
inline = inline.split('[')
pathway_list = inline[0].split()[2:]
pathway_name = ' '.join(pathway_list)
pathway_ko_number = 'K' + inline[0].split()[1]
pathway_ko_list = []
raw_output.write('\n%s\t\t' % (pathway_name))
elif inline.startswith('D'): # Query KO annotation
inline = inline.split(';')
contig = inline[0].strip('D ')
gene_ko_number = inline[1].split('>')[1].strip('</a')
pathway_ko_list.append(gene_ko_number)
super_pathway_count += 1
raw_output.write('%s\t' % (gene_ko_number))
csv_output.writerow([contig, gene_ko_number, pathway_name, pathway_ko_number, super_pathway])
if options.gene_names:
gene_description = inline[2].strip()
gene_name = inline[1].split('>')[2].strip()
ko_output.writerow([gene_ko_number, gene_name, gene_description])
if options.gene_names:
ko_output.close()
raw_output.close()
superpathway_output.close()
os.system("sort %s | uniq > /tmp/tmp.txt && cat /tmp/tmp.txt > %s" % (superpathway_output_filename, superpathway_output_filename))
os.system("sort %s | uniq > /tmp/tmp.txt && cat /tmp/tmp.txt > %s" % (raw_output_filename, raw_output_filename))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment