Skip to content

Instantly share code, notes, and snippets.

@theboocock
Created April 19, 2023 01:12
Show Gist options
  • Select an option

  • Save theboocock/aacf72277a572ee3fe589c430bfd496e to your computer and use it in GitHub Desktop.

Select an option

Save theboocock/aacf72277a572ee3fe589c430bfd496e to your computer and use it in GitHub Desktop.
# Copyright (c) 2017 Boocock James <james.boocock@otago.ac.nz>
# Author: Boocock James <james.boocock@otago.ac.nz>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import pysam
import argparse
import statistics
class BamRow(object):
def __init__(self, reference_name, reference_start, reference_end, reference_length, query_name, gene_name):
"""
Load three prime reads.
"""
self.reference_name = reference_name
self.reference_start = reference_start
self.reference_end = reference_end
self.reference_length = reference_length
self.query_name = query_name
self.gene_name = gene_name
class GFFRow(object):
def __init__(self, chrom, start, end, info_column,gene_name,row):
"""
Simple construction for a GFF row.
"""
self.chrom = chrom
self.start = start
self.end = end
self.info_column = info_column
self.gene_name = gene_name
self.row_split = row
def set_start(self, reference_start):
"""
Change the start coordinate in the GFF.
"""
self.row_split[3] = str(int(reference_start)+1)
def set_end(self, reference_end):
"""
Change the end coordinate in the GFF.
"""
# Convert GFFROw into indices.
self.row_split[4] = str(int(reference_end))
def remove_from_start(self, reference_start):
"""
Change the start coordinate in the GFF.
"""
self.row_split[3] = str(int(self.start) - int(reference_start))
def add_to_end(self, reference_end):
"""
Change the end coordinate in the GFF.
"""
# Convert GFFROw into indices.
self.row_split[4] = str(int(self.end) + int(reference_end))
def __str__(self):
return("\t".join(self.row_split))
def bam_to_bed(bam_in):
input_bam= pysam.AlignmentFile(bam_in, "rb")
gene_list = {}
median_utr = []
for line in input_bam:
gene_name = (line.query_name.split("_")[4])
bam_row_tmp = BamRow(line.reference_name,line.reference_start, line.reference_end, line.reference_length, line.query_name, gene_name)
try:
gene_list[gene_name].append(bam_row_tmp)
except KeyError:
gene_list[gene_name] = [bam_row_tmp]
# TODO
median_utr.append(line.reference_length)
return(gene_list, statistics.median(median_utr))
def read_gff(gff_input):
gff_rows = {}
with open(gff_input) as f:
for line in f:
if "#" not in line:
line = line.strip().split("\t")
type_of_annot = line[2]
if type_of_annot == "CDS" or type_of_annot == "gene" or type_of_annot == "mRNA":
chrom = line[0]
start = line[3]
end = line[4]
info = line[8]
gene_id = info.split("=")[1].split(";")[0].split("_")[0]
try:
gff_rows[gene_id].append(GFFRow(chrom, start, end, info, gene_id, line))
except:
gff_rows[gene_id]=[(GFFRow(chrom, start, end, info, gene_id, line))]
return(gff_rows)
def get_longest_three_prime(list_of_three_prime):
"""
List of three prime.
"""
longest_three_prime = 0
for three_prime in list_of_three_prime:
if three_prime.reference_length > longest_three_prime:
longest_three_prime = three_prime.reference_length
return(longest_three_prime + 1)
def get_earliest_start(list_of_three_prime):
longest_three_prime = 1000000000
for three_prime in list_of_three_prime:
if three_prime.reference_start < longest_three_prime:
longest_three_prime = three_prime.reference_start
return(longest_three_prime)
def get_latest_end(list_of_three_prime):
longest_three_prime = 0
for three_prime in list_of_three_prime:
if three_prime.reference_end > longest_three_prime:
longest_three_prime = three_prime.reference_end
return(longest_three_prime )
def generate_all_with_three_prime_utrs(gff_in, gene_list, median_utr):
"""
Generate all with three prime utrs.
"""
for genes in gff_in.keys():
use_median=False
try:
list_of_three_prime = gene_list[genes]
except KeyError:
use_median=True
gff_rows = gff_in[genes]
for gff_row in gff_rows:
if not use_median:
read_length = get_longest_three_prime(list_of_three_prime)
if "C" in genes:
# must be rev comp
if use_median:
gff_row.remove_from_start(median_utr)
else:
read_length = get_earliest_start(list_of_three_prime)
gff_row.set_start(read_length)
elif "W" in genes:
# must be rev comp
if use_median:
gff_row.add_to_end(median_utr)
else:
read_length = get_latest_end(list_of_three_prime)
gff_row.set_end(read_length)
# must be forward strand so 3' is at the end.
print(gff_row)
def main():
"""
Create bam input file
"""
parser = argparse.ArgumentParser(description="Create bed file from alignment")
parser.add_argument("bam_file")
parser.add_argument("-g","--gff",dest="gff_file", help="Gff file")
args = parser.parse_args()
gene_list, median_utr = bam_to_bed(args.bam_file)
gff_in = read_gff(args.gff_file)
generate_all_with_three_prime_utrs(gff_in, gene_list, median_utr)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment