amplyfyer
""" | |
A scrappy esoteric glue program for Ben because he only really likes databases. | |
Given a tab delimited file where: | |
fields[0] ORF id | |
fields[1] 1-pos Start | |
fields[2] 1-pos End | |
fields[-1] Structural Prediction Code | |
Output a cast dataframe with a line per ORF, describing all structural predictions | |
for that ORF as a single string. Additionally enumerating each viable code's proportion | |
of that string as a percentage. You may alter CODES to add/remove prediction codes | |
as needed. | |
It is anticipated that the input is sorted by ORF and start position. | |
Things will go wrong if that is not the case... | |
Output is printed to stdout, warnings to stderr. | |
Usage: python amplyfy.py <data.tab> | |
Author: Sam Nicholls <msn@aber.ac.uk> | |
Version: 0.1.1 | |
""" | |
import sys | |
def print_orf(orf_id, orf_codelist): | |
"""Given a list of ORF code substrings, output the necessary data row.""" | |
orf = "".join(orf_codelist) | |
code_pc = [] | |
for c in CODES: | |
code_pc.append( orf.count(c) / float(len(orf)) ) | |
print("\t".join([orf_id, orf] + ["%.3f" % (x * 100) for x in code_pc])) | |
INPUT_F = open(sys.argv[1]) | |
CODES = ['H', 'C', 'T', 'E'] # valid codes, Ben: add new codes to this if desired. | |
# Print header | |
print("\t".join(["ORF_id", "secondary_structure"] + ["%s_pc" % c for c in CODES])) | |
observed_orf_ids = set([]) | |
orf_codes = [] | |
last_end = 1 | |
current_orf_id = None | |
for line in INPUT_F: | |
fields = line.strip().split("\t") | |
if not current_orf_id: | |
# Assign first ORF | |
current_orf_id = fields[0] | |
elif fields[0] != current_orf_id: | |
# Complete ORF and reset variables | |
print_orf(current_orf_id, orf_codes) | |
observed_orf_ids.add(current_orf_id) | |
current_orf_id = fields[0] | |
orf_codes = [] | |
last_end = 1 | |
start = int(fields[1]) | |
end = int(fields[2]) | |
code = fields[-1] | |
if current_orf_id in observed_orf_ids: | |
sys.stderr.write("ORF '%s' has been processed already, was your input file sorted?\n" % current_orf_id) | |
if last_end + 1 != start and last_end != 1: | |
sys.stderr.write("Next start for '%s' at Pos %d does not follow from the last end.\n" % (current_orf_id, start)) | |
if code not in CODES: | |
sys.stderr.write("Prediction Code '%s' not in the list of valid codes...\n" % code) | |
# Add code characters to ongoing ORF list | |
orf_codes.append ( (end - (start-1)) * code ) | |
last_end = end | |
# Don't forget last ORF! | |
print_orf(current_orf_id, orf_codes) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment