florianzwagemaker/filter_reference_from_alignment.py

## filter_reference_from_alignment.py
import sys

# please install the following requirements: pandas, pysam, biopython
import pandas as pd
import pysam
from Bio import SeqIO


# python filter_reference_from_alignment.py input.bam references.fasta mapping_statistics.csv filtered_references.fasta
inputbam, inputref, outputcsv, outputfasta = sys.argv[1:]

# Open BAM file
bamfile = pysam.AlignmentFile(inputbam, "rb")

# Loop over reference headers and count mapped reads, mismatches, and sequence identity
data = []
for ref in bamfile.references:
    mapped_reads = 0
    total_mismatches = 0
    total_identity = 0
    for read in bamfile.fetch(ref):
        if not read.is_unmapped:
            mapped_reads += 1
            total_mismatches += read.get_tag("NM")
            total_identity += 1 - (read.get_tag("NM") / read.query_alignment_length)
    if mapped_reads > 0:
        avg_mismatches = total_mismatches / mapped_reads
        avg_identity = total_identity / mapped_reads
    else:
        avg_mismatches = 0
        avg_identity = 0
    data.append(
        {
            "Reference": ref,
            "Mapped Reads": mapped_reads,
            "Avg. Mismatches per Read": avg_mismatches,
            "Avg. Sequence Identity": avg_identity,
        }
    )

# Create pandas dataframe with the basic mapping statistics
df = pd.DataFrame(data)

# output to csv file
df.to_csv(outputcsv)


# sort by "Mapped Reads" column
df = df.sort_values(by="Mapped Reads", ascending=False)

#keep only the rows where "Mapped Reads" has a value greater than 20
df = df[df["Mapped Reads"] > 20]

# get the reference names
refnames = df["Reference"].values

# read the reference fasta file and filter for the reference names
records = []
for record in SeqIO.parse(inputref, "fasta"):
    if record.id in refnames:
        records.append(record)

# write the filtered references to a new fasta file
SeqIO.write(records, outputfasta, "fasta")
	import sys

	# please install the following requirements: pandas, pysam, biopython
	import pandas as pd
	import pysam
	from Bio import SeqIO


	# python filter_reference_from_alignment.py input.bam references.fasta mapping_statistics.csv filtered_references.fasta
	inputbam, inputref, outputcsv, outputfasta = sys.argv[1:]

	# Open BAM file
	bamfile = pysam.AlignmentFile(inputbam, "rb")

	# Loop over reference headers and count mapped reads, mismatches, and sequence identity
	data = []
	for ref in bamfile.references:
	mapped_reads = 0
	total_mismatches = 0
	total_identity = 0
	for read in bamfile.fetch(ref):
	if not read.is_unmapped:
	mapped_reads += 1
	total_mismatches += read.get_tag("NM")
	total_identity += 1 - (read.get_tag("NM") / read.query_alignment_length)
	if mapped_reads > 0:
	avg_mismatches = total_mismatches / mapped_reads
	avg_identity = total_identity / mapped_reads
	else:
	avg_mismatches = 0
	avg_identity = 0
	data.append(
	{
	"Reference": ref,
	"Mapped Reads": mapped_reads,
	"Avg. Mismatches per Read": avg_mismatches,
	"Avg. Sequence Identity": avg_identity,
	}
	)

	# Create pandas dataframe with the basic mapping statistics
	df = pd.DataFrame(data)

	# output to csv file
	df.to_csv(outputcsv)


	# sort by "Mapped Reads" column
	df = df.sort_values(by="Mapped Reads", ascending=False)

	#keep only the rows where "Mapped Reads" has a value greater than 20
	df = df[df["Mapped Reads"] > 20]

	# get the reference names
	refnames = df["Reference"].values

	# read the reference fasta file and filter for the reference names
	records = []
	for record in SeqIO.parse(inputref, "fasta"):
	if record.id in refnames:
	records.append(record)

	# write the filtered references to a new fasta file
	SeqIO.write(records, outputfasta, "fasta")