Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save florianzwagemaker/6459c204eb2ff1166bcae6ffe2ca27ec to your computer and use it in GitHub Desktop.
Save florianzwagemaker/6459c204eb2ff1166bcae6ffe2ca27ec to your computer and use it in GitHub Desktop.
A quick script to filter references based on amount of mapped reads in a bamfile. Includes only references that have more than 20 mapped reads.
import sys
# please install the following requirements: pandas, pysam, biopython
import pandas as pd
import pysam
from Bio import SeqIO
# python filter_reference_from_alignment.py input.bam references.fasta mapping_statistics.csv filtered_references.fasta
inputbam, inputref, outputcsv, outputfasta = sys.argv[1:]
# Open BAM file
bamfile = pysam.AlignmentFile(inputbam, "rb")
# Loop over reference headers and count mapped reads, mismatches, and sequence identity
data = []
for ref in bamfile.references:
mapped_reads = 0
total_mismatches = 0
total_identity = 0
for read in bamfile.fetch(ref):
if not read.is_unmapped:
mapped_reads += 1
total_mismatches += read.get_tag("NM")
total_identity += 1 - (read.get_tag("NM") / read.query_alignment_length)
if mapped_reads > 0:
avg_mismatches = total_mismatches / mapped_reads
avg_identity = total_identity / mapped_reads
else:
avg_mismatches = 0
avg_identity = 0
data.append(
{
"Reference": ref,
"Mapped Reads": mapped_reads,
"Avg. Mismatches per Read": avg_mismatches,
"Avg. Sequence Identity": avg_identity,
}
)
# Create pandas dataframe with the basic mapping statistics
df = pd.DataFrame(data)
# output to csv file
df.to_csv(outputcsv)
# sort by "Mapped Reads" column
df = df.sort_values(by="Mapped Reads", ascending=False)
#keep only the rows where "Mapped Reads" has a value greater than 20
df = df[df["Mapped Reads"] > 20]
# get the reference names
refnames = df["Reference"].values
# read the reference fasta file and filter for the reference names
records = []
for record in SeqIO.parse(inputref, "fasta"):
if record.id in refnames:
records.append(record)
# write the filtered references to a new fasta file
SeqIO.write(records, outputfasta, "fasta")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment