Skip to content

Instantly share code, notes, and snippets.

@TTTPOB
Last active December 30, 2021 03:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TTTPOB/5eb874c03aff6cfb924e1b38ba8031fb to your computer and use it in GitHub Desktop.
Save TTTPOB/5eb874c03aff6cfb924e1b38ba8031fb to your computer and use it in GitHub Desktop.
shift and extend paired end bam into a bed file
#!/usr/bin/env python3
import pysam
from collections import defaultdict
import subprocess
import click
from pathlib import Path
import pandas as pd
import numpy as np
from sys import stderr
def read_pari_generator(bam: pysam.AlignmentFile, region_string=None):
read_dict = defaultdict(lambda: [None, None])
for read in bam.fetch(region=region_string):
if not read.is_proper_pair or read.is_secondary or read.is_supplementary:
continue
qname = read.query_name
if qname not in read_dict:
if read.is_read1:
read_dict[qname][0] = read
else:
read_dict[qname][1] = read
else:
if read.is_read1:
yield read, read_dict[qname][1]
else:
yield read_dict[qname][0], read
del read_dict[qname]
@click.command()
@click.option("--input-bam", "-i", help="Input BAM file", required=True)
@click.option("--extend-size", "-e", help="Extend size", default=150, type=int)
def main(input_bam, extend_size):
# get bam file stem name, which means not containing the extension and path
prefix = Path(input_bam).stem
# init the pysam generator
bam = pysam.AlignmentFile(input_bam, "rb")
# iterate through the bam file, get mate pair and check if the have mapped to the same chrom
# if not, skip, if yes, get the center point of the pair
chrom_list = []
center_list = []
for r1, r2 in read_pari_generator(bam):
r1_ref = r1.reference_name
r2_ref = r2.reference_name
if r1_ref != r2_ref:
continue
r1_start = r1.reference_start
r2_end = r2.reference_end
center = (r1_start + r2_end) / 2
chrom_list.append(r1_ref)
center_list.append(center)
# extend the center point by the extend size, half of the extend size is added to the left and right
# the center point is rounded to the nearest integer
left_list = [center - extend_size for center in center_list]
right_list = [center + extend_size for center in center_list]
left_list = [round(center) for center in left_list]
right_list = [round(center) for center in right_list]
# construct a df using results produced above
bed = pd.DataFrame({"chrom": chrom_list, "left": left_list, "right": right_list})
bed["left"] = bed["left"].astype(np.uint32)
bed["right"] = bed["right"].astype(np.uint32)
# write the df to a bed file named prefix.bed
# tab delimited
bed.to_csv(f"{prefix}.bed", sep="\t", index=False, header=False)
# sort the bed file
sort_command = f"sort -k1,1 -k2,2n {prefix}.bed > {prefix}.sorted.bed"
print(sort_command, file=stderr)
subprocess.run(sort_command, shell=True)
# overwrite the sorted bed file to the original bed file
mv_command = f"mv {prefix}.sorted.bed {prefix}.bed"
print(mv_command, file=stderr)
subprocess.run(mv_command, shell=True)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment