# kfuku52/infer_frames.py

## infer_frames.py
import sys

def infer_frames(gff_file, output_file="output.gff"):
    """Rewrite a GFF file with the phase (column 8) of CDS rows recomputed.

    Rows are streamed from ``gff_file`` to ``output_file`` in input order.
    Rows starting with ``#`` and non-CDS feature rows are copied verbatim.
    Consecutive CDS rows are buffered as one group and written back, still in
    input order, with column 8 replaced by the inferred phase.

    Within a group the first CDS in transcription order (lowest start on the
    "+" strand, highest start otherwise) gets phase 0.  Each later CDS gets
    ``(previous_phase - previous_length) % 3``: the number of leading bases
    that complete the codon left open by the previous segment, which is how
    the GFF specification defines column 8.

    Args:
        gff_file: Path of the tab-separated GFF file to read.
        output_file: Path of the file to write.  Defaults to "output.gff"
            in the current working directory.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If the start or end column of a CDS row is not an integer.

    Notes:
        * Groups are formed purely from row adjacency: any ``#`` row or
          non-CDS feature row ends the current group, so files that
          interleave exon and CDS rows yield one-row groups.
        * The strand of a group is read from its first CDS row; every strand
          value other than "+" is handled as the minus strand.
        * Non-comment rows with fewer than 8 tab-separated columns are not
          copied to the output and do not end the current group.
    """

    def write_group(cds_rows, outfile):
        """Assign phases to one group of CDS rows and write it in input order."""
        if not cds_rows:
            return

        # Walk the segments in transcription order.  Sorting, rather than
        # trusting file order, gives the same phases whether the rows are
        # listed by ascending coordinate or 5'->3'.
        on_minus = cds_rows[0]["columns"][6] != "+"
        phase = 0
        for row in sorted(cds_rows, key=lambda r: r["start"], reverse=on_minus):
            row["frame"] = str(phase)
            segment_length = row["end"] - row["start"] + 1
            phase = (phase - segment_length) % 3

        for row in cds_rows:
            columns = row["columns"]
            outfile.write("\t".join(columns[:7] + [row["frame"]] + columns[8:]) + "\n")

    with open(gff_file, "r") as infile, open(output_file, "w") as outfile:
        cds_rows = []
        for line in infile:
            if line.startswith("#"):
                # Flush first so directives such as "###" stay behind the
                # CDS rows that preceded them in the input.
                write_group(cds_rows, outfile)
                cds_rows = []
                outfile.write(line)
                continue

            columns = line.strip().split("\t")
            if len(columns) < 8:
                continue

            if columns[2] == "CDS":
                cds_rows.append({
                    "start": int(columns[3]),
                    "end": int(columns[4]),
                    "frame": columns[7],
                    "columns": columns,
                })
                continue

            # Any other feature row closes the pending CDS group.
            write_group(cds_rows, outfile)
            cds_rows = []
            outfile.write(line)

        write_group(cds_rows, outfile)

if __name__ == "__main__":
    # Script entry point: exactly one command-line argument (the GFF path)
    # is accepted; anything else prints the usage line and exits with 1.
    if len(sys.argv) == 2:
        gff_file = sys.argv[1]
        infer_frames(gff_file)
    else:
        print("Usage: python infer_frames.py <gff_file>")
        sys.exit(1)
	import sys

	def infer_frames(gff_file, output_file="output.gff"):
		"""Rewrite a GFF file with the phase (column 8) of CDS rows recomputed.

		Rows are streamed from ``gff_file`` to ``output_file`` in input order.
		Rows starting with ``#`` and non-CDS feature rows are copied verbatim.
		Consecutive CDS rows are buffered as one group and written back, still in
		input order, with column 8 replaced by the inferred phase.

		Within a group the first CDS in transcription order (lowest start on the
		"+" strand, highest start otherwise) gets phase 0.  Each later CDS gets
		``(previous_phase - previous_length) % 3``: the number of leading bases
		that complete the codon left open by the previous segment, which is how
		the GFF specification defines column 8.

		Args:
			gff_file: Path of the tab-separated GFF file to read.
			output_file: Path of the file to write.  Defaults to "output.gff"
				in the current working directory.

		Raises:
			OSError: If either file cannot be opened.
			ValueError: If the start or end column of a CDS row is not an integer.

		Notes:
			* Groups are formed purely from row adjacency: any ``#`` row or
			  non-CDS feature row ends the current group, so files that
			  interleave exon and CDS rows yield one-row groups.
			* The strand of a group is read from its first CDS row; every strand
			  value other than "+" is handled as the minus strand.
			* Non-comment rows with fewer than 8 tab-separated columns are not
			  copied to the output and do not end the current group.
		"""

		def write_group(cds_rows, outfile):
			"""Assign phases to one group of CDS rows and write it in input order."""
			if not cds_rows:
				return

			# Walk the segments in transcription order.  Sorting, rather than
			# trusting file order, gives the same phases whether the rows are
			# listed by ascending coordinate or 5'->3'.
			on_minus = cds_rows[0]["columns"][6] != "+"
			phase = 0
			for row in sorted(cds_rows, key=lambda r: r["start"], reverse=on_minus):
				row["frame"] = str(phase)
				segment_length = row["end"] - row["start"] + 1
				phase = (phase - segment_length) % 3

			for row in cds_rows:
				columns = row["columns"]
				outfile.write("\t".join(columns[:7] + [row["frame"]] + columns[8:]) + "\n")

		with open(gff_file, "r") as infile, open(output_file, "w") as outfile:
			cds_rows = []
			for line in infile:
				if line.startswith("#"):
					# Flush first so directives such as "###" stay behind the
					# CDS rows that preceded them in the input.
					write_group(cds_rows, outfile)
					cds_rows = []
					outfile.write(line)
					continue

				columns = line.strip().split("\t")
				if len(columns) < 8:
					continue

				if columns[2] == "CDS":
					cds_rows.append({
						"start": int(columns[3]),
						"end": int(columns[4]),
						"frame": columns[7],
						"columns": columns,
					})
					continue

				# Any other feature row closes the pending CDS group.
				write_group(cds_rows, outfile)
				cds_rows = []
				outfile.write(line)

			write_group(cds_rows, outfile)

	# Script entry point: exactly one command-line argument (the GFF path)
	# is accepted; anything else prints the usage line and exits with 1.
	if __name__ == "__main__":
		if len(sys.argv) != 2:
			print("Usage: python infer_frames.py <gff_file>")
			sys.exit(1)

		gff_file = sys.argv[1]
		infer_frames(gff_file)