SamStudio8/amplyfy.py

## amplyfy.py
"""
A scrappy esoteric glue program for Ben because he only really likes databases.
Given a tab delimited file where:
    fields[0]  ORF id
    fields[1]  1-pos Start
    fields[2]  1-pos End
    fields[-1] Structural Prediction Code
Output a cast dataframe with a line per ORF, describing all structural predictions
for that ORF as a single string. Additionally enumerating each viable code's proportion
of that string as a percentage. You may alter CODES to add/remove prediction codes
as needed.

It is anticipated that the input is sorted by ORF and start position.
Things will go wrong if that is not the case...
Output is printed to stdout, warnings to stderr.

Usage: python amplyfy.py <data.tab>

Author: Sam Nicholls <msn@aber.ac.uk>
Version: 0.1.1
"""

import sys

def print_orf(orf_id, orf_codelist, codes=None):
    """Print one tab-delimited output row describing a single ORF.

    The code substrings are joined into a single structure string. For each
    prediction code, the number of occurrences of that code in the string is
    divided by the string length and reported as a percentage with three
    decimal places.

    Args:
        orf_id: Identifier of the ORF; printed as the first column.
        orf_codelist: List of code substrings (e.g. ``["HHH", "CC"]``) in
            positional order.
        codes: Optional sequence of prediction codes to report on, one
            output column per code. Defaults to the module-level ``CODES``.

    An ORF with no code characters reports 0.000 for every code instead of
    raising ZeroDivisionError.
    """
    if codes is None:
        codes = CODES

    orf = "".join(orf_codelist)
    total = len(orf)

    percentages = []
    for c in codes:
        # Guard the division: an empty structure string has no proportions.
        fraction = orf.count(c) / float(total) if total else 0.0
        percentages.append("%.3f" % (fraction * 100))

    print("\t".join([orf_id, orf] + percentages))


CODES = ['H', 'C', 'T', 'E']    # valid codes, Ben: add new codes to this if desired.

# Fail with a usage message rather than a bare IndexError when no file is given.
if len(sys.argv) < 2:
    sys.stderr.write("Usage: python amplyfy.py <data.tab>\n")
    sys.exit(1)

observed_orf_ids = set()    # ORFs whose row has already been printed
orf_codes = []              # code substrings collected for the current ORF
last_end = None             # end of the previous segment; None before the first one
current_orf_id = None

# The context manager closes the input file even if a malformed row raises.
with open(sys.argv[1]) as INPUT_F:
    # Print header
    print("\t".join(["ORF_id", "secondary_structure"] + ["%s_pc" % c for c in CODES]))

    for line in INPUT_F:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no fields.
            continue
        fields = line.split("\t")

        if fields[0] != current_orf_id:
            if current_orf_id is not None:
                # Complete the previous ORF before starting the next one
                print_orf(current_orf_id, orf_codes)
                observed_orf_ids.add(current_orf_id)

            # Start (or restart) the per-ORF state
            current_orf_id = fields[0]
            orf_codes = []
            last_end = None

        start = int(fields[1])
        end = int(fields[2])
        code = fields[-1]

        if current_orf_id in observed_orf_ids:
            sys.stderr.write("ORF '%s' has been processed already, was your input file sorted?\n" % current_orf_id)
        # Compare against None, not 1: a previous segment may legitimately
        # end at position 1 and must still be checked for contiguity.
        if last_end is not None and last_end + 1 != start:
            sys.stderr.write("Next start for '%s' at Pos %d does not follow from the last end.\n" % (current_orf_id, start))
        if code not in CODES:
            sys.stderr.write("Prediction Code '%s' not in the list of valid codes...\n" % code)

        # Add code characters to ongoing ORF list (positions are 1-based, inclusive)
        orf_codes.append((end - (start - 1)) * code)
        last_end = end

# Don't forget last ORF! (nothing to print if the input had no data rows)
if current_orf_id is not None:
    print_orf(current_orf_id, orf_codes)
	"""
	A scrappy esoteric glue program for Ben because he only really likes databases.
	Given a tab delimited file where:
	fields[0] ORF id
	fields[1] 1-pos Start
	fields[2] 1-pos End
	fields[-1] Structural Prediction Code
	Output a cast dataframe with a line per ORF, describing all structural predictions
	for that ORF as a single string. Additionally enumerating each viable code's proportion
	of that string as a percentage. You may alter CODES to add/remove prediction codes
	as needed.

	It is anticipated that the input is sorted by ORF and start position.
	Things will go wrong if that is not the case...
	Output is printed to stdout, warnings to stderr.

	Usage: python amplyfy.py <data.tab>

	Author: Sam Nicholls <msn@aber.ac.uk>
	Version: 0.1.1
	"""

	import sys

	# NOTE(review): this repeats the print_orf defined earlier in this file;
	# the original indentation of this copy was lost and is reconstructed here.
	def print_orf(orf_id, orf_codelist, codes=None):
	    """Print one tab-delimited output row describing a single ORF.

	    The code substrings are joined into a single structure string. For each
	    prediction code, the number of occurrences of that code in the string is
	    divided by the string length and reported as a percentage with three
	    decimal places.

	    Args:
	        orf_id: Identifier of the ORF; printed as the first column.
	        orf_codelist: List of code substrings (e.g. ``["HHH", "CC"]``) in
	            positional order.
	        codes: Optional sequence of prediction codes to report on, one
	            output column per code. Defaults to the module-level ``CODES``.

	    An ORF with no code characters reports 0.000 for every code instead of
	    raising ZeroDivisionError.
	    """
	    if codes is None:
	        codes = CODES

	    orf = "".join(orf_codelist)
	    total = len(orf)

	    percentages = []
	    for c in codes:
	        # Guard the division: an empty structure string has no proportions.
	        fraction = orf.count(c) / float(total) if total else 0.0
	        percentages.append("%.3f" % (fraction * 100))

	    print("\t".join([orf_id, orf] + percentages))


	# NOTE(review): this repeats the top-level script found earlier in this
	# file; the original indentation of this copy was lost and is reconstructed.
	CODES = ['H', 'C', 'T', 'E'] # valid codes, Ben: add new codes to this if desired.

	# Fail with a usage message rather than a bare IndexError when no file is given.
	if len(sys.argv) < 2:
	    sys.stderr.write("Usage: python amplyfy.py <data.tab>\n")
	    sys.exit(1)

	observed_orf_ids = set()    # ORFs whose row has already been printed
	orf_codes = []              # code substrings collected for the current ORF
	last_end = None             # end of the previous segment; None before the first one
	current_orf_id = None

	# The context manager closes the input file even if a malformed row raises.
	with open(sys.argv[1]) as INPUT_F:
	    # Print header
	    print("\t".join(["ORF_id", "secondary_structure"] + ["%s_pc" % c for c in CODES]))

	    for line in INPUT_F:
	        line = line.strip()
	        if not line:
	            # Blank lines (e.g. a trailing newline) carry no fields.
	            continue
	        fields = line.split("\t")

	        if fields[0] != current_orf_id:
	            if current_orf_id is not None:
	                # Complete the previous ORF before starting the next one
	                print_orf(current_orf_id, orf_codes)
	                observed_orf_ids.add(current_orf_id)

	            # Start (or restart) the per-ORF state
	            current_orf_id = fields[0]
	            orf_codes = []
	            last_end = None

	        start = int(fields[1])
	        end = int(fields[2])
	        code = fields[-1]

	        if current_orf_id in observed_orf_ids:
	            sys.stderr.write("ORF '%s' has been processed already, was your input file sorted?\n" % current_orf_id)
	        # Compare against None, not 1: a previous segment may legitimately
	        # end at position 1 and must still be checked for contiguity.
	        if last_end is not None and last_end + 1 != start:
	            sys.stderr.write("Next start for '%s' at Pos %d does not follow from the last end.\n" % (current_orf_id, start))
	        if code not in CODES:
	            sys.stderr.write("Prediction Code '%s' not in the list of valid codes...\n" % code)

	        # Add code characters to ongoing ORF list (positions are 1-based, inclusive)
	        orf_codes.append((end - (start - 1)) * code)
	        last_end = end

	# Don't forget last ORF! (nothing to print if the input had no data rows)
	if current_orf_id is not None:
	    print_orf(current_orf_id, orf_codes)