Last active
April 26, 2023 14:45
-
-
Save kfuku52/96612cb8a78562a6ae9c9e38caa4a355 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
def infer_frames(gff_file): | |
with open(gff_file, "r") as infile, open("output.gff", "w") as outfile: | |
current_gene = None | |
cds_list = [] | |
def process_cds_list(cds_list, strand): | |
if strand == "+": | |
previous_cds = None | |
for cds in cds_list: | |
if previous_cds is None: | |
cds["frame"] = "0" | |
previous_cds = cds | |
else: | |
length = cds["end"] - cds["start"] + 1 | |
previous_exon_length = previous_cds["end"] - previous_cds["start"] + 1 | |
frame = (int(previous_cds["frame"]) + previous_exon_length) % 3 | |
cds["frame"] = str(frame) | |
previous_cds = cds | |
else: # strand == "-" | |
cds_list.reverse() | |
previous_cds = None | |
for cds in cds_list: | |
if previous_cds is None: | |
cds["frame"] = "0" | |
previous_cds = cds | |
else: | |
length = cds["end"] - cds["start"] + 1 | |
previous_exon_length = previous_cds["end"] - previous_cds["start"] + 1 | |
frame = (int(previous_cds["frame"]) + previous_exon_length) % 3 | |
cds["frame"] = str(frame) | |
previous_cds = cds | |
for cds in cds_list: | |
outfile.write("\t".join(cds["columns"][:7] + [cds["frame"]] + cds["columns"][8:]) + "\n") | |
for line in infile: | |
if line.startswith("#"): | |
outfile.write(line) | |
continue | |
columns = line.strip().split("\t") | |
if len(columns) < 8: | |
continue | |
feature_type = columns[2] | |
if feature_type == "CDS" and current_gene is not None: | |
cds_list.append({ | |
"start": int(columns[3]), | |
"end": int(columns[4]), | |
"frame": columns[7], | |
"columns": columns | |
}) | |
else: | |
if current_gene is not None and cds_list: | |
process_cds_list(cds_list, current_gene["strand"]) | |
cds_list = [] | |
current_gene = { | |
"id": columns[8], | |
"strand": columns[6] | |
} | |
outfile.write(line) | |
if current_gene is not None and cds_list: | |
process_cds_list(cds_list, current_gene["strand"]) | |
if __name__ == "__main__": | |
if len(sys.argv) != 2: | |
print("Usage: python infer_frames.py <gff_file>") | |
sys.exit(1) | |
gff_file = sys.argv[1] | |
infer_frames(gff_file) |
Sorry ChatGPT, I wasn't accurate enough in framing the task. A bug on the minus-strand genes has been fixed.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This code was generated by ChatGPT (GPT4) on 20230418 using the following prompt:
You are an expert bioinformatician. You obtained a gff file. CDS features should have frame information (0, 1, or 2) in the 8th column of the gff format, but this file did not have it. Assuming that the first CDS features (CDS1) always start with the frame 0, you have to infer the frames of the second CDS and beyond, in multi-exon genes. Please implement a python program for this task.