jstvz/ko_to_pathwway.py

## ko_to_pathwway.py
#!/usr/bin/python
"""
Script for parsing the output of the KAAS annotation engine.
"""

import sys
import csv
import os
from optparse import OptionParser

usage = "usage: %prog [options] arg"
parser = OptionParser(usage)
parser.add_option("-i", "--input", dest="input",help="The q00001.keg htext file from the hier.tar.gz output from KAAS")
parser.add_option("-o", "--output", dest="output",help="Prefix for output files: i.e. prefix_raw_KEGG.txt, prefix_superpathways_KEGG.txt, etc.")
parser.add_option("-g", "--gene-names", action="store_true", dest="gene_names", default=False, help="Include predicted gene names in output")

(options, args) = parser.parse_args()
if options.input is None:
    print "Input is missing!"
    parser.print_help()
    sys.exit(-1)

pathway_ko_lists = {}
raw_output_filename = options.output + '_raw_KEGG.txt'
superpathway_output_filename = options.output + '_superpathways_KEGG.txt'
csv_output_filename = options.output + '_KEGG.csv'

#raw_output = csv.writer(open(raw_output_handle, 'wb'), dialect = 'excel')
#superpathway_output = csv.writer(open(superpathway_output_handle, 'wb'), dialect = 'excel')

raw_output = open(raw_output_filename, 'wb')
superpathway_output = open(superpathway_output_filename, 'wb')
csv_output_handle = open(csv_output_filename, 'wb')
csv_output = csv.writer(csv_output_handle, dialect = 'excel')

if options.gene_names:
    ko_output_filename = options.output + '_KEGG.ko'
    ko_output_handle = open(ko_output_filename, 'wb')
    ko_output = csv.writer(ko_output_handle, dialect = 'excel')
super_pathway_count = 0
for inline in open(options.input):
    if inline.startswith('A'): # Top level Category, e.g. "Metabolism"
        category = inline.split('>')[1].strip('</b')
    elif inline.startswith('B'): # SuperPathways, e.g."Carbohydrate Metabolism"
        if super_pathway_count != 0:
            superpathway_output.write('%s\t%i\n' % (super_pathway, super_pathway_count))
            try:
                super_pathway = inline.split('>')[1].strip('</b')
                super_pathway_count = 0
            except:
                pass
        else:
            try:
                super_pathway = inline.split('>')[1].strip('</b')
                super_pathway_count = 0
            except:
                pass
    elif inline.startswith('C'): # KO Reference Pathway, e.g. "Glycolysis / Gluconeogenesis"
        inline = inline.split('[')
        pathway_list = inline[0].split()[2:]
        pathway_name = ' '.join(pathway_list)
        pathway_ko_number = 'K' + inline[0].split()[1]
        pathway_ko_list = []
        raw_output.write('\n%s\t\t' % (pathway_name))
    elif inline.startswith('D'): # Query KO annotation
        inline = inline.split(';')
        contig = inline[0].strip('D ')
        gene_ko_number = inline[1].split('>')[1].strip('</a')
        pathway_ko_list.append(gene_ko_number)
        super_pathway_count += 1
        raw_output.write('%s\t' % (gene_ko_number))
        csv_output.writerow([contig, gene_ko_number, pathway_name, pathway_ko_number, super_pathway])
        if options.gene_names:
            gene_description = inline[2].strip()
            gene_name = inline[1].split('>')[2].strip()
            ko_output.writerow([gene_ko_number, gene_name, gene_description])

if options.gene_names:
    ko_output.close()
raw_output.close()
superpathway_output.close()

os.system("sort %s | uniq > /tmp/tmp.txt && cat /tmp/tmp.txt > %s" % (superpathway_output_filename, superpathway_output_filename))
os.system("sort %s | uniq > /tmp/tmp.txt && cat /tmp/tmp.txt > %s" % (raw_output_filename, raw_output_filename))
	#!/usr/bin/python
	"""
	Script for parsing the output of the KAAS annotation engine.
	"""

	import sys
	import csv
	import os
	from optparse import OptionParser

	usage = "usage: %prog [options] arg"
	parser = OptionParser(usage)
	parser.add_option("-i", "--input", dest="input",help="The q00001.keg htext file from the hier.tar.gz output from KAAS")
	parser.add_option("-o", "--output", dest="output",help="Prefix for output files: i.e. prefix_raw_KEGG.txt, prefix_superpathways_KEGG.txt, etc.")
	parser.add_option("-g", "--gene-names", action="store_true", dest="gene_names", default=False, help="Include predicted gene names in output")

	(options, args) = parser.parse_args()
	if options.input is None:
	print "Input is missing!"
	parser.print_help()
	sys.exit(-1)

	pathway_ko_lists = {}
	raw_output_filename = options.output + '_raw_KEGG.txt'
	superpathway_output_filename = options.output + '_superpathways_KEGG.txt'
	csv_output_filename = options.output + '_KEGG.csv'

	#raw_output = csv.writer(open(raw_output_handle, 'wb'), dialect = 'excel')
	#superpathway_output = csv.writer(open(superpathway_output_handle, 'wb'), dialect = 'excel')

	raw_output = open(raw_output_filename, 'wb')
	superpathway_output = open(superpathway_output_filename, 'wb')
	csv_output_handle = open(csv_output_filename, 'wb')
	csv_output = csv.writer(csv_output_handle, dialect = 'excel')

	if options.gene_names:
	ko_output_filename = options.output + '_KEGG.ko'
	ko_output_handle = open(ko_output_filename, 'wb')
	ko_output = csv.writer(ko_output_handle, dialect = 'excel')
	super_pathway_count = 0
	for inline in open(options.input):
	if inline.startswith('A'): # Top level Category, e.g. "Metabolism"
	category = inline.split('>')[1].strip('</b')
	elif inline.startswith('B'): # SuperPathways, e.g."Carbohydrate Metabolism"
	if super_pathway_count != 0:
	superpathway_output.write('%s\t%i\n' % (super_pathway, super_pathway_count))
	try:
	super_pathway = inline.split('>')[1].strip('</b')
	super_pathway_count = 0
	except:
	pass
	else:
	try:
	super_pathway = inline.split('>')[1].strip('</b')
	super_pathway_count = 0
	except:
	pass
	elif inline.startswith('C'): # KO Reference Pathway, e.g. "Glycolysis / Gluconeogenesis"
	inline = inline.split('[')
	pathway_list = inline[0].split()[2:]
	pathway_name = ' '.join(pathway_list)
	pathway_ko_number = 'K' + inline[0].split()[1]
	pathway_ko_list = []
	raw_output.write('\n%s\t\t' % (pathway_name))
	elif inline.startswith('D'): # Query KO annotation
	inline = inline.split(';')
	contig = inline[0].strip('D ')
	gene_ko_number = inline[1].split('>')[1].strip('</a')
	pathway_ko_list.append(gene_ko_number)
	super_pathway_count += 1
	raw_output.write('%s\t' % (gene_ko_number))
	csv_output.writerow([contig, gene_ko_number, pathway_name, pathway_ko_number, super_pathway])
	if options.gene_names:
	gene_description = inline[2].strip()
	gene_name = inline[1].split('>')[2].strip()
	ko_output.writerow([gene_ko_number, gene_name, gene_description])

	if options.gene_names:
	ko_output.close()
	raw_output.close()
	superpathway_output.close()

	os.system("sort %s \| uniq > /tmp/tmp.txt && cat /tmp/tmp.txt > %s" % (superpathway_output_filename, superpathway_output_filename))
	os.system("sort %s \| uniq > /tmp/tmp.txt && cat /tmp/tmp.txt > %s" % (raw_output_filename, raw_output_filename))