Skip to content

Instantly share code, notes, and snippets.

@SamStudio8
Last active November 13, 2017 16:11
Show Gist options
  • Save SamStudio8/f507ae1759f6c1bb16157532b9a894e5 to your computer and use it in GitHub Desktop.
Save SamStudio8/f507ae1759f6c1bb16157532b9a894e5 to your computer and use it in GitHub Desktop.
amplyfyer
"""
A scrappy esoteric glue program for Ben because he only really likes databases.
Given a tab delimited file where:
fields[0] ORF id
fields[1] 1-pos Start
fields[2] 1-pos End
fields[-1] Structural Prediction Code
Output a cast dataframe with a line per ORF, describing all structural predictions
for that ORF as a single string. Additionally enumerating each viable code's proportion
of that string as a percentage. You may alter CODES to add/remove prediction codes
as needed.
It is anticipated that the input is sorted by ORF and start position.
Things will go wrong if that is not the case...
Output is printed to stdout, warnings to stderr.
Usage: python amplyfy.py <data.tab>
Author: Sam Nicholls <msn@aber.ac.uk>
Version: 0.1.1
"""
import sys
def print_orf(orf_id, orf_codelist):
"""Given a list of ORF code substrings, output the necessary data row."""
orf = "".join(orf_codelist)
code_pc = []
for c in CODES:
code_pc.append( orf.count(c) / float(len(orf)) )
print("\t".join([orf_id, orf] + ["%.3f" % (x * 100) for x in code_pc]))
INPUT_F = open(sys.argv[1])
CODES = ['H', 'C', 'T', 'E'] # valid codes, Ben: add new codes to this if desired.
# Print header
print("\t".join(["ORF_id", "secondary_structure"] + ["%s_pc" % c for c in CODES]))
observed_orf_ids = set([])
orf_codes = []
last_end = 1
current_orf_id = None
for line in INPUT_F:
fields = line.strip().split("\t")
if not current_orf_id:
# Assign first ORF
current_orf_id = fields[0]
elif fields[0] != current_orf_id:
# Complete ORF and reset variables
print_orf(current_orf_id, orf_codes)
observed_orf_ids.add(current_orf_id)
current_orf_id = fields[0]
orf_codes = []
last_end = 1
start = int(fields[1])
end = int(fields[2])
code = fields[-1]
if current_orf_id in observed_orf_ids:
sys.stderr.write("ORF '%s' has been processed already, was your input file sorted?\n" % current_orf_id)
if last_end + 1 != start and last_end != 1:
sys.stderr.write("Next start for '%s' at Pos %d does not follow from the last end.\n" % (current_orf_id, start))
if code not in CODES:
sys.stderr.write("Prediction Code '%s' not in the list of valid codes...\n" % code)
# Add code characters to ongoing ORF list
orf_codes.append ( (end - (start-1)) * code )
last_end = end
# Don't forget last ORF!
print_orf(current_orf_id, orf_codes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment