Created
June 13, 2024 03:22
-
-
Save pansapiens/3b0c882e853722f77efa98d0ad010a65 to your computer and use it in GitHub Desktop.
Extract mapped reads from BAM, lowercase masking soft-clipping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
### | |
# bam2fq_softclip.py | |
### | |
# A script for exploring soft-clipping of aligned reads. | |
# Extracts mapped reads from a BAM file as a FASTQ, but encodes 'soft-clipped' regions | |
# as lowercase. Soft-clipped regions can be quickly visualized in the terminal like: | |
# | |
# ./bam2fq_softclip.py aligned.bam | grep --color=always [atcg] |less -R | |
# | |
# There are additional options to output stats for 5' and 3' soft clipping lengths, | |
# including the raw 5'/3' soft-clipped lengths for each read. | |
from typing import Tuple | |
import argparse | |
import logging | |
import pysam | |
import statistics | |
import sys | |
import gzip | |
def soft_clip_to_lowercase(sequence: str, cigar: list) -> str: | |
result = [] | |
seq_index = 0 | |
for (op, length) in cigar: | |
if op == 4: # Soft clipping | |
result.append(sequence[seq_index:seq_index + length].lower()) | |
else: | |
result.append(sequence[seq_index:seq_index + length]) | |
seq_index += length | |
return ''.join(result) | |
def qualities_to_ascii(qualities: list) -> str: | |
return ''.join(chr(q + 33) for q in qualities) | |
def bam_to_custom_fastq(input_bam: str, output_handle) -> Tuple[list, list]: | |
soft_clip_5 = [] | |
soft_clip_3 = [] | |
with pysam.AlignmentFile(input_bam, "rb") as bamfile: | |
for read in bamfile.fetch(): | |
if not read.is_unmapped: | |
sequence = read.query_sequence | |
quality = read.query_qualities | |
name = read.query_name | |
custom_sequence = soft_clip_to_lowercase(sequence, read.cigartuples) | |
quality_ascii = qualities_to_ascii(quality) | |
output_handle.write(f"@{name}\n{custom_sequence}\n+\n{quality_ascii}\n") | |
cigar = read.cigartuples | |
if cigar[0][0] == 4: # Soft-clipped at 5' end | |
soft_clip_5.append(cigar[0][1]) | |
if cigar[-1][0] == 4: # Soft-clipped at 3' end | |
soft_clip_3.append(cigar[-1][1]) | |
return soft_clip_5, soft_clip_3 | |
def summarize_statistics(data): | |
if not data: | |
return {} | |
return { | |
'count': len(data), | |
'mean': statistics.mean(data), | |
'median': statistics.median(data), | |
'min': min(data), | |
'max': max(data), | |
'std_dev': statistics.stdev(data) if len(data) > 1 else 0, | |
} | |
def print_summary(stats, title, output_handle): | |
output_handle.write(f"{title} Summary Statistics:\n") | |
output_handle.write(f"Count: {stats['count']}\n") | |
output_handle.write(f"Mean: {stats['mean']:.2f}\n") | |
output_handle.write(f"Median: {stats['median']}\n") | |
output_handle.write(f"Min: {stats['min']}\n") | |
output_handle.write(f"Max: {stats['max']}\n") | |
output_handle.write(f"Standard Deviation: {stats['std_dev']:.2f}\n") | |
output_handle.write("\n") | |
def write_clip_lengths(soft_clip_5, soft_clip_3, output_clip_lengths): | |
with gzip.open(output_clip_lengths, 'wt') as f: | |
for sc5, sc3 in zip(soft_clip_5, soft_clip_3): | |
f.write(f"{sc5}\t{sc3}\n") | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser(description="Convert BAM to custom FASTQ with soft-clipped regions in lowercase, gather statistics, and output soft-clipped lengths.") | |
parser.add_argument("input_bam", help="Input BAM file") | |
parser.add_argument("-o", "--output", help="Output FASTQ file (default: stdout)", default="-") | |
parser.add_argument("--stats-output", help="Output file for statistics (default: stderr)", default=None) | |
parser.add_argument("--output-clip-lengths", help="Gzipped TSV file for soft-clipped lengths", default=None) | |
args = parser.parse_args() | |
logging.basicConfig(level=logging.INFO) | |
if args.output == "-": | |
output_handle = sys.stdout | |
else: | |
output_handle = open(args.output, "w") | |
soft_clip_5, soft_clip_3 = bam_to_custom_fastq(args.input_bam, output_handle) | |
if args.output != "-": | |
output_handle.close() | |
stats_5 = summarize_statistics(soft_clip_5) | |
stats_3 = summarize_statistics(soft_clip_3) | |
if args.stats_output: | |
with open(args.stats_output, "w") as stats_handle: | |
print_summary(stats_5, "5' Soft-Clipped", stats_handle) | |
print_summary(stats_3, "3' Soft-Clipped", stats_handle) | |
else: | |
print_summary(stats_5, "5' Soft-Clipped", sys.stderr) | |
print_summary(stats_3, "3' Soft-Clipped", sys.stderr) | |
if args.output_clip_lengths: | |
write_clip_lengths(soft_clip_5, soft_clip_3, args.output_clip_lengths) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment