Skip to content

Instantly share code, notes, and snippets.

@kfuku52
Last active April 26, 2023 14:45
Show Gist options
  • Save kfuku52/96612cb8a78562a6ae9c9e38caa4a355 to your computer and use it in GitHub Desktop.
Save kfuku52/96612cb8a78562a6ae9c9e38caa4a355 to your computer and use it in GitHub Desktop.
import sys
def infer_frames(gff_file):
with open(gff_file, "r") as infile, open("output.gff", "w") as outfile:
current_gene = None
cds_list = []
def process_cds_list(cds_list, strand):
if strand == "+":
previous_cds = None
for cds in cds_list:
if previous_cds is None:
cds["frame"] = "0"
previous_cds = cds
else:
length = cds["end"] - cds["start"] + 1
previous_exon_length = previous_cds["end"] - previous_cds["start"] + 1
frame = (int(previous_cds["frame"]) + previous_exon_length) % 3
cds["frame"] = str(frame)
previous_cds = cds
else: # strand == "-"
cds_list.reverse()
previous_cds = None
for cds in cds_list:
if previous_cds is None:
cds["frame"] = "0"
previous_cds = cds
else:
length = cds["end"] - cds["start"] + 1
previous_exon_length = previous_cds["end"] - previous_cds["start"] + 1
frame = (int(previous_cds["frame"]) + previous_exon_length) % 3
cds["frame"] = str(frame)
previous_cds = cds
for cds in cds_list:
outfile.write("\t".join(cds["columns"][:7] + [cds["frame"]] + cds["columns"][8:]) + "\n")
for line in infile:
if line.startswith("#"):
outfile.write(line)
continue
columns = line.strip().split("\t")
if len(columns) < 8:
continue
feature_type = columns[2]
if feature_type == "CDS" and current_gene is not None:
cds_list.append({
"start": int(columns[3]),
"end": int(columns[4]),
"frame": columns[7],
"columns": columns
})
else:
if current_gene is not None and cds_list:
process_cds_list(cds_list, current_gene["strand"])
cds_list = []
current_gene = {
"id": columns[8],
"strand": columns[6]
}
outfile.write(line)
if current_gene is not None and cds_list:
process_cds_list(cds_list, current_gene["strand"])
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python infer_frames.py <gff_file>")
sys.exit(1)
gff_file = sys.argv[1]
infer_frames(gff_file)
@kfuku52
Copy link
Author

kfuku52 commented Apr 18, 2023

This code was generated by ChatGPT (GPT4) on 20230418 using the following prompt:

You are an expert bioinformatician. You obtained a gff file. CDS features should have frame information (0, 1, or 2) in the 8th column of the gff format, but this file did not have it. Assuming that the first CDS features (CDS1) always start with the frame 0, you have to infer the frames of the second CDS and beyond, in multi-exon genes. Please implement a python program for this task.

@kfuku52
Copy link
Author

kfuku52 commented Apr 26, 2023

Sorry ChatGPT, I wasn't accurate enough in framing the task. A bug on the minus-strand genes has been fixed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment