TTTPOB/shift_extend.py

## shift_extend.py
#!/usr/bin/env python3
import pysam
from collections import defaultdict
import subprocess
import click
from pathlib import Path
import pandas as pd
import numpy as np
from sys import stderr


def read_pari_generator(bam: pysam.AlignmentFile, region_string=None):
    read_dict = defaultdict(lambda: [None, None])
    for read in bam.fetch(region=region_string):
        if not read.is_proper_pair or read.is_secondary or read.is_supplementary:
            continue
        qname = read.query_name
        if qname not in read_dict:
            if read.is_read1:
                read_dict[qname][0] = read
            else:
                read_dict[qname][1] = read
        else:
            if read.is_read1:
                yield read, read_dict[qname][1]
            else:
                yield read_dict[qname][0], read
            del read_dict[qname]


@click.command()
@click.option("--input-bam", "-i", help="Input BAM file", required=True)
@click.option("--extend-size", "-e", help="Extend size", default=150, type=int)
def main(input_bam, extend_size):
    # get bam file stem name, which means not containing the extension and path
    prefix = Path(input_bam).stem

    # init the pysam generator
    bam = pysam.AlignmentFile(input_bam, "rb")

    # iterate through the bam file, get mate pair and check if the have mapped to the same chrom
    # if not, skip, if yes, get the center point of the pair
    chrom_list = []
    center_list = []
    for r1, r2 in read_pari_generator(bam):
        r1_ref = r1.reference_name
        r2_ref = r2.reference_name
        if r1_ref != r2_ref:
            continue
        r1_start = r1.reference_start
        r2_end = r2.reference_end
        center = (r1_start + r2_end) / 2
        chrom_list.append(r1_ref)
        center_list.append(center)

    # extend the center point by the extend size, half of the extend size is added to the left and right
    # the center point is rounded to the nearest integer
    left_list = [center - extend_size for center in center_list]
    right_list = [center + extend_size for center in center_list]
    left_list = [round(center) for center in left_list]
    right_list = [round(center) for center in right_list]

    # construct a df using results produced above
    bed = pd.DataFrame({"chrom": chrom_list, "left": left_list, "right": right_list})
    bed["left"] = bed["left"].astype(np.uint32)
    bed["right"] = bed["right"].astype(np.uint32)

    # write the df to a bed file named prefix.bed
    # tab delimited
    bed.to_csv(f"{prefix}.bed", sep="\t", index=False, header=False)

    # sort the bed file
    sort_command = f"sort -k1,1 -k2,2n {prefix}.bed > {prefix}.sorted.bed"
    print(sort_command, file=stderr)
    subprocess.run(sort_command, shell=True)

    # overwrite the sorted bed file to the original bed file
    mv_command = f"mv {prefix}.sorted.bed {prefix}.bed"
    print(mv_command, file=stderr)
    subprocess.run(mv_command, shell=True)


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	import pysam
	from collections import defaultdict
	import subprocess
	import click
	from pathlib import Path
	import pandas as pd
	import numpy as np
	from sys import stderr


	def read_pari_generator(bam: pysam.AlignmentFile, region_string=None):
	read_dict = defaultdict(lambda: [None, None])
	for read in bam.fetch(region=region_string):
	if not read.is_proper_pair or read.is_secondary or read.is_supplementary:
	continue
	qname = read.query_name
	if qname not in read_dict:
	if read.is_read1:
	read_dict[qname][0] = read
	else:
	read_dict[qname][1] = read
	else:
	if read.is_read1:
	yield read, read_dict[qname][1]
	else:
	yield read_dict[qname][0], read
	del read_dict[qname]


	@click.command()
	@click.option("--input-bam", "-i", help="Input BAM file", required=True)
	@click.option("--extend-size", "-e", help="Extend size", default=150, type=int)
	def main(input_bam, extend_size):
	# get bam file stem name, which means not containing the extension and path
	prefix = Path(input_bam).stem

	# init the pysam generator
	bam = pysam.AlignmentFile(input_bam, "rb")

	# iterate through the bam file, get mate pair and check if the have mapped to the same chrom
	# if not, skip, if yes, get the center point of the pair
	chrom_list = []
	center_list = []
	for r1, r2 in read_pari_generator(bam):
	r1_ref = r1.reference_name
	r2_ref = r2.reference_name
	if r1_ref != r2_ref:
	continue
	r1_start = r1.reference_start
	r2_end = r2.reference_end
	center = (r1_start + r2_end) / 2
	chrom_list.append(r1_ref)
	center_list.append(center)

	# extend the center point by the extend size, half of the extend size is added to the left and right
	# the center point is rounded to the nearest integer
	left_list = [center - extend_size for center in center_list]
	right_list = [center + extend_size for center in center_list]
	left_list = [round(center) for center in left_list]
	right_list = [round(center) for center in right_list]

	# construct a df using results produced above
	bed = pd.DataFrame({"chrom": chrom_list, "left": left_list, "right": right_list})
	bed["left"] = bed["left"].astype(np.uint32)
	bed["right"] = bed["right"].astype(np.uint32)

	# write the df to a bed file named prefix.bed
	# tab delimited
	bed.to_csv(f"{prefix}.bed", sep="\t", index=False, header=False)

	# sort the bed file
	sort_command = f"sort -k1,1 -k2,2n {prefix}.bed > {prefix}.sorted.bed"
	print(sort_command, file=stderr)
	subprocess.run(sort_command, shell=True)

	# overwrite the sorted bed file to the original bed file
	mv_command = f"mv {prefix}.sorted.bed {prefix}.bed"
	print(mv_command, file=stderr)
	subprocess.run(mv_command, shell=True)


	if __name__ == "__main__":
	main()