Last active
December 30, 2021 03:27
-
-
Save TTTPOB/5eb874c03aff6cfb924e1b38ba8031fb to your computer and use it in GitHub Desktop.
shift and extend paired end bam into a bed file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import pysam | |
from collections import defaultdict | |
import subprocess | |
import click | |
from pathlib import Path | |
import pandas as pd | |
import numpy as np | |
from sys import stderr | |
def read_pari_generator(bam: pysam.AlignmentFile, region_string=None): | |
read_dict = defaultdict(lambda: [None, None]) | |
for read in bam.fetch(region=region_string): | |
if not read.is_proper_pair or read.is_secondary or read.is_supplementary: | |
continue | |
qname = read.query_name | |
if qname not in read_dict: | |
if read.is_read1: | |
read_dict[qname][0] = read | |
else: | |
read_dict[qname][1] = read | |
else: | |
if read.is_read1: | |
yield read, read_dict[qname][1] | |
else: | |
yield read_dict[qname][0], read | |
del read_dict[qname] | |
@click.command() | |
@click.option("--input-bam", "-i", help="Input BAM file", required=True) | |
@click.option("--extend-size", "-e", help="Extend size", default=150, type=int) | |
def main(input_bam, extend_size): | |
# get bam file stem name, which means not containing the extension and path | |
prefix = Path(input_bam).stem | |
# init the pysam generator | |
bam = pysam.AlignmentFile(input_bam, "rb") | |
# iterate through the bam file, get mate pair and check if the have mapped to the same chrom | |
# if not, skip, if yes, get the center point of the pair | |
chrom_list = [] | |
center_list = [] | |
for r1, r2 in read_pari_generator(bam): | |
r1_ref = r1.reference_name | |
r2_ref = r2.reference_name | |
if r1_ref != r2_ref: | |
continue | |
r1_start = r1.reference_start | |
r2_end = r2.reference_end | |
center = (r1_start + r2_end) / 2 | |
chrom_list.append(r1_ref) | |
center_list.append(center) | |
# extend the center point by the extend size, half of the extend size is added to the left and right | |
# the center point is rounded to the nearest integer | |
left_list = [center - extend_size for center in center_list] | |
right_list = [center + extend_size for center in center_list] | |
left_list = [round(center) for center in left_list] | |
right_list = [round(center) for center in right_list] | |
# construct a df using results produced above | |
bed = pd.DataFrame({"chrom": chrom_list, "left": left_list, "right": right_list}) | |
bed["left"] = bed["left"].astype(np.uint32) | |
bed["right"] = bed["right"].astype(np.uint32) | |
# write the df to a bed file named prefix.bed | |
# tab delimited | |
bed.to_csv(f"{prefix}.bed", sep="\t", index=False, header=False) | |
# sort the bed file | |
sort_command = f"sort -k1,1 -k2,2n {prefix}.bed > {prefix}.sorted.bed" | |
print(sort_command, file=stderr) | |
subprocess.run(sort_command, shell=True) | |
# overwrite the sorted bed file to the original bed file | |
mv_command = f"mv {prefix}.sorted.bed {prefix}.bed" | |
print(mv_command, file=stderr) | |
subprocess.run(mv_command, shell=True) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment