Skip to content

Instantly share code, notes, and snippets.

@pansapiens
Created June 13, 2024 03:22
Show Gist options
  • Save pansapiens/3b0c882e853722f77efa98d0ad010a65 to your computer and use it in GitHub Desktop.
Save pansapiens/3b0c882e853722f77efa98d0ad010a65 to your computer and use it in GitHub Desktop.
Extract mapped reads from BAM, lowercase masking soft-clipping
#!/usr/bin/env python
###
# bam2fq_softclip.py
###
# A script for exploring soft-clipping of aligned reads.
# Extracts mapped reads from a BAM file as a FASTQ, but encodes 'soft-clipped' regions
# as lowercase. Soft-clipped regions can be quickly visualized in the terminal like:
#
# ./bam2fq_softclip.py aligned.bam | grep --color=always [atcg] |less -R
#
# There are additional options to output stats for 5' and 3' soft clipping lengths,
# including the raw 5'/3' soft-clipped lengths for each read.
from typing import Tuple
import argparse
import logging
import pysam
import statistics
import sys
import gzip
def soft_clip_to_lowercase(sequence: str, cigar: list) -> str:
result = []
seq_index = 0
for (op, length) in cigar:
if op == 4: # Soft clipping
result.append(sequence[seq_index:seq_index + length].lower())
else:
result.append(sequence[seq_index:seq_index + length])
seq_index += length
return ''.join(result)
def qualities_to_ascii(qualities: list) -> str:
return ''.join(chr(q + 33) for q in qualities)
def bam_to_custom_fastq(input_bam: str, output_handle) -> Tuple[list, list]:
soft_clip_5 = []
soft_clip_3 = []
with pysam.AlignmentFile(input_bam, "rb") as bamfile:
for read in bamfile.fetch():
if not read.is_unmapped:
sequence = read.query_sequence
quality = read.query_qualities
name = read.query_name
custom_sequence = soft_clip_to_lowercase(sequence, read.cigartuples)
quality_ascii = qualities_to_ascii(quality)
output_handle.write(f"@{name}\n{custom_sequence}\n+\n{quality_ascii}\n")
cigar = read.cigartuples
if cigar[0][0] == 4: # Soft-clipped at 5' end
soft_clip_5.append(cigar[0][1])
if cigar[-1][0] == 4: # Soft-clipped at 3' end
soft_clip_3.append(cigar[-1][1])
return soft_clip_5, soft_clip_3
def summarize_statistics(data):
if not data:
return {}
return {
'count': len(data),
'mean': statistics.mean(data),
'median': statistics.median(data),
'min': min(data),
'max': max(data),
'std_dev': statistics.stdev(data) if len(data) > 1 else 0,
}
def print_summary(stats, title, output_handle):
output_handle.write(f"{title} Summary Statistics:\n")
output_handle.write(f"Count: {stats['count']}\n")
output_handle.write(f"Mean: {stats['mean']:.2f}\n")
output_handle.write(f"Median: {stats['median']}\n")
output_handle.write(f"Min: {stats['min']}\n")
output_handle.write(f"Max: {stats['max']}\n")
output_handle.write(f"Standard Deviation: {stats['std_dev']:.2f}\n")
output_handle.write("\n")
def write_clip_lengths(soft_clip_5, soft_clip_3, output_clip_lengths):
with gzip.open(output_clip_lengths, 'wt') as f:
for sc5, sc3 in zip(soft_clip_5, soft_clip_3):
f.write(f"{sc5}\t{sc3}\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert BAM to custom FASTQ with soft-clipped regions in lowercase, gather statistics, and output soft-clipped lengths.")
parser.add_argument("input_bam", help="Input BAM file")
parser.add_argument("-o", "--output", help="Output FASTQ file (default: stdout)", default="-")
parser.add_argument("--stats-output", help="Output file for statistics (default: stderr)", default=None)
parser.add_argument("--output-clip-lengths", help="Gzipped TSV file for soft-clipped lengths", default=None)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
if args.output == "-":
output_handle = sys.stdout
else:
output_handle = open(args.output, "w")
soft_clip_5, soft_clip_3 = bam_to_custom_fastq(args.input_bam, output_handle)
if args.output != "-":
output_handle.close()
stats_5 = summarize_statistics(soft_clip_5)
stats_3 = summarize_statistics(soft_clip_3)
if args.stats_output:
with open(args.stats_output, "w") as stats_handle:
print_summary(stats_5, "5' Soft-Clipped", stats_handle)
print_summary(stats_3, "3' Soft-Clipped", stats_handle)
else:
print_summary(stats_5, "5' Soft-Clipped", sys.stderr)
print_summary(stats_3, "3' Soft-Clipped", sys.stderr)
if args.output_clip_lengths:
write_clip_lengths(soft_clip_5, soft_clip_3, args.output_clip_lengths)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment