Created
July 4, 2024 13:56
-
-
Save florianzwagemaker/6459c204eb2ff1166bcae6ffe2ca27ec to your computer and use it in GitHub Desktop.
A quick script to filter references based on amount of mapped reads in a bamfile. Includes only references that have more than 20 mapped reads.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
# please install the following requirements: pandas, pysam, biopython | |
import pandas as pd | |
import pysam | |
from Bio import SeqIO | |
# python filter_reference_from_alignment.py input.bam references.fasta mapping_statistics.csv filtered_references.fasta | |
inputbam, inputref, outputcsv, outputfasta = sys.argv[1:] | |
# Open BAM file | |
bamfile = pysam.AlignmentFile(inputbam, "rb") | |
# Loop over reference headers and count mapped reads, mismatches, and sequence identity | |
data = [] | |
for ref in bamfile.references: | |
mapped_reads = 0 | |
total_mismatches = 0 | |
total_identity = 0 | |
for read in bamfile.fetch(ref): | |
if not read.is_unmapped: | |
mapped_reads += 1 | |
total_mismatches += read.get_tag("NM") | |
total_identity += 1 - (read.get_tag("NM") / read.query_alignment_length) | |
if mapped_reads > 0: | |
avg_mismatches = total_mismatches / mapped_reads | |
avg_identity = total_identity / mapped_reads | |
else: | |
avg_mismatches = 0 | |
avg_identity = 0 | |
data.append( | |
{ | |
"Reference": ref, | |
"Mapped Reads": mapped_reads, | |
"Avg. Mismatches per Read": avg_mismatches, | |
"Avg. Sequence Identity": avg_identity, | |
} | |
) | |
# Create pandas dataframe with the basic mapping statistics | |
df = pd.DataFrame(data) | |
# output to csv file | |
df.to_csv(outputcsv) | |
# sort by "Mapped Reads" column | |
df = df.sort_values(by="Mapped Reads", ascending=False) | |
#keep only the rows where "Mapped Reads" has a value greater than 20 | |
df = df[df["Mapped Reads"] > 20] | |
# get the reference names | |
refnames = df["Reference"].values | |
# read the reference fasta file and filter for the reference names | |
records = [] | |
for record in SeqIO.parse(inputref, "fasta"): | |
if record.id in refnames: | |
records.append(record) | |
# write the filtered references to a new fasta file | |
SeqIO.write(records, outputfasta, "fasta") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment