Skip to content

Instantly share code, notes, and snippets.

@wdecoster
Created April 24, 2017 21:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wdecoster/7cad6080950fa1e3ae3eaeeac4f6ae4d to your computer and use it in GitHub Desktop.
Save wdecoster/7cad6080950fa1e3ae3eaeeac4f6ae4d to your computer and use it in GitHub Desktop.
# wdecoster
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import argparse
from Bio import SeqIO
parser = argparse.ArgumentParser(
description="Investigate overrepresented nucleotides in reads of median length.")
parser.add_argument("--outdir",
help="Specify directory in which output has to be created.",
default=".")
parser.add_argument("fastq",
help="Reads data in fastq format.")
args = parser.parse_args()
def main():
lengths = getLengths(args.fastq)
sizeRange = LengthHistogram(lengths)
fq = getBin(args.fastq, sizeRange)
plotNucleotideDiversity2([dat[0] for dat in fq])
plotQual([dat[1] for dat in fq])
def getLengths(fastq):
''' Loop over the fastq file, extract length of sequences'''
return np.array([len(record) for record in SeqIO.parse(fastq, "fastq")])
def LengthHistogram(lengths):
''' Create a histogram, and return the number of reads per bin and bin edges'''
plt.hist(lengths, bins='auto')
plt.savefig(os.path.join(args.outdir, "LengthHistogram.png"), format='png', dpi=1000)
plt.close("all")
hist, bin_edges = np.histogram(lengths, bins='auto')
maxindex = np.argmax(hist)
return((bin_edges[maxindex], bin_edges[maxindex + 1]))
def getBin(fq, sizeRange):
''' Loop over the fastq file, extract sequence and quality scores in lists'''
return [(list(record.seq), list(record.letter_annotations["phred_quality"])) for record in SeqIO.parse(fq, "fastq") if sizeRange[0] < len(record) < sizeRange[1]]
def plotNucleotideDiversity(fqlists):
'''
Create a FastQC-like "Per base sequence content" plot
Plot fraction of nucleotides per position
zip will stop when shortest read is exhausted
'''
numreads = len(fqlists)
sns.set_style("darkgrid")
plt.plot(np.array([position.count('A') / numreads for position in zip(*fqlists)]), 'green')
plt.plot(np.array([position.count('T') / numreads for position in zip(*fqlists)]), 'red')
plt.plot(np.array([position.count('G') / numreads for position in zip(*fqlists)]), 'black')
plt.plot(np.array([position.count('C') / numreads for position in zip(*fqlists)]), 'blue')
plt.legend(['A', 'T', 'G', 'C'], loc='upper right')
plt.xlabel('Position in read')
plt.ylabel('Per base sequence content')
plt.savefig(os.path.join(args.outdir, "PerBaseSequenceContent.png"), format='png', dpi=1000)
plt.close("all")
def plotQual(quallist):
'''
Create a FastQC-like "Per base sequence quality" plot
Plot average quality per position
zip will stop when shortest read is exhausted
'''
sns.set_style("darkgrid")
plt.plot(np.array([np.mean(position) for position in zip(*quallist)]), 'red')
plt.xlabel('Position in read')
plt.ylabel('Per base sequence quality')
plt.savefig(os.path.join(args.outdir, "PerBaseSequenceQuality.png"), format='png', dpi=1000)
plt.close("all")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment