Skip to content

Instantly share code, notes, and snippets.

@wdecoster
Last active August 7, 2019 17:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wdecoster/a0ca604306b5778a7167f705c597ee38 to your computer and use it in GitHub Desktop.
Save wdecoster/a0ca604306b5778a7167f705c597ee38 to your computer and use it in GitHub Desktop.
import sys
import os
import re
import seaborn as sns
import pandas as pd
import numpy as np
from multiprocessing import Pool
from scipy import stats
import matplotlib.pyplot as plt
import pysam
def processBam(bam):
'''
Processing function: calls pool of worker functions
to extract from a bam file two definitions of the edit distances to the reference genome scaled by read length
Returned in a pandas DataFrame
'''
samfile = pysam.AlignmentFile(bam, "rb")
if not samfile.has_index():
pysam.index(bam)
samfile = pysam.AlignmentFile(bam, "rb") # Need to reload the samfile after creating index
chromosomes = samfile.references
datadf = pd.DataFrame()
pool = Pool(processes=12)
params = zip([bam]*len(chromosomes), chromosomes)
try:
output = [results for results in pool.imap(extractFromBam, params)]
except KeyboardInterrupt:
print("Terminating worker threads")
pool.terminate()
pool.join()
sys.exit()
datadf["editDistancesNM"] = np.array([x for y in [elem[0] for elem in output] for x in y])
datadf["editDistancesMD"] = np.array([x for y in [elem[1] for elem in output] for x in y])
return datadf
def extractFromBam(params):
'''
Worker function per chromosome
loop over a bam file and create tuple with lists containing metrics:
two definitions of the edit distances to the reference genome scaled by aligned read length
'''
bam, chromosome = params
samfile = pysam.AlignmentFile(bam, "rb")
editDistancesNM = []
editDistancesMD = []
for read in samfile.fetch(reference=chromosome, multiple_iterators=True):
editDistancesNM.append(read.get_tag("NM")/read.query_alignment_length)
editDistancesMD.append(
(sum([len(item) for item in re.split('[0-9^]', read.get_tag("MD"))]) + # Parse MD string to get mismatches/deletions
sum([item[1] for item in read.cigartuples if item[0] == 1])) # Parse cigar to get insertions
/read.query_alignment_length)
return (editDistancesNM, editDistancesMD)
def makePlot(datadf):
try:
plot = sns.jointplot(
x='editDistancesNM',
y='editDistancesMD',
data=datadf,
kind="kde",
color="#4CB391",
stat_func=stats.pearsonr,
space=0,
size=10)
plot.savefig('EditDistancesCompared_kde.png', format='png', dpi=1000)
except: # throws an error with perfect correlation! :-D
pass
plot = sns.jointplot(
x='editDistancesNM',
y='editDistancesMD',
data=datadf,
kind="scatter",
color="#4CB391",
stat_func=stats.pearsonr,
space=0,
joint_kws={"s": 1},
size=10)
plot.savefig('EditDistancesCompared_scatter.png', format='png', dpi=1000)
df = processBam(sys.argv[1])
makePlot(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment