wdecoster/PerBaseSequenceContentandQuality.py

## PerBaseSequenceContentandQuality.py
# wdecoster

import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import argparse
from Bio import SeqIO

parser = argparse.ArgumentParser(
		description="Investigate overrepresented nucleotides in reads of median length.")
parser.add_argument("--outdir",
		help="Specify directory in which output has to be created.",
		default=".")
parser.add_argument("fastq",
		help="Reads data in fastq format.")
args = parser.parse_args()


def main():
	lengths = getLengths(args.fastq)
	sizeRange = LengthHistogram(lengths)
	fq = getBin(args.fastq, sizeRange)
	plotNucleotideDiversity2([dat[0] for dat in fq])
	plotQual([dat[1] for dat in fq])

def getLengths(fastq):
	'''	Loop over the fastq file, extract length of sequences'''
	return np.array([len(record) for record in SeqIO.parse(fastq, "fastq")])


def LengthHistogram(lengths):
	''' Create a histogram, and return the number of reads per bin and bin edges'''
	plt.hist(lengths, bins='auto')
	plt.savefig(os.path.join(args.outdir, "LengthHistogram.png"), format='png', dpi=1000)
	plt.close("all")
	hist, bin_edges = np.histogram(lengths, bins='auto')
	maxindex = np.argmax(hist)
	return((bin_edges[maxindex], bin_edges[maxindex + 1]))


def getBin(fq, sizeRange):
	''' Loop over the fastq file, extract sequence and quality scores in lists'''
	return [(list(record.seq), list(record.letter_annotations["phred_quality"])) for record in SeqIO.parse(fq, "fastq") if sizeRange[0] < len(record) < sizeRange[1]]


def plotNucleotideDiversity(fqlists):
	'''
	Create a FastQC-like "Per base sequence content" plot
	Plot fraction of nucleotides per position
	zip will stop when shortest read is exhausted
	'''
	numreads = len(fqlists)
	sns.set_style("darkgrid")
	plt.plot(np.array([position.count('A') / numreads for position in zip(*fqlists)]), 'green')
	plt.plot(np.array([position.count('T') / numreads for position in zip(*fqlists)]), 'red')
	plt.plot(np.array([position.count('G') / numreads for position in zip(*fqlists)]), 'black')
	plt.plot(np.array([position.count('C') / numreads for position in zip(*fqlists)]), 'blue')
	plt.legend(['A', 'T', 'G', 'C'], loc='upper right')
	plt.xlabel('Position in read')
	plt.ylabel('Per base sequence content')
	plt.savefig(os.path.join(args.outdir, "PerBaseSequenceContent.png"), format='png', dpi=1000)
	plt.close("all")


def plotQual(quallist):
	'''
	Create a FastQC-like "Per base sequence quality" plot
	Plot average quality per position
	zip will stop when shortest read is exhausted
	'''
	sns.set_style("darkgrid")
	plt.plot(np.array([np.mean(position) for position in zip(*quallist)]), 'red')
	plt.xlabel('Position in read')
	plt.ylabel('Per base sequence quality')
	plt.savefig(os.path.join(args.outdir, "PerBaseSequenceQuality.png"), format='png', dpi=1000)
	plt.close("all")

if __name__ == "__main__":
	main()
	# wdecoster

	import os
	import seaborn as sns
	import numpy as np
	import matplotlib.pyplot as plt
	import argparse
	from Bio import SeqIO

	parser = argparse.ArgumentParser(
	description="Investigate overrepresented nucleotides in reads of median length.")
	parser.add_argument("--outdir",
	help="Specify directory in which output has to be created.",
	default=".")
	parser.add_argument("fastq",
	help="Reads data in fastq format.")
	args = parser.parse_args()


	def main():
	lengths = getLengths(args.fastq)
	sizeRange = LengthHistogram(lengths)
	fq = getBin(args.fastq, sizeRange)
	plotNucleotideDiversity2([dat[0] for dat in fq])
	plotQual([dat[1] for dat in fq])

	def getLengths(fastq):
	''' Loop over the fastq file, extract length of sequences'''
	return np.array([len(record) for record in SeqIO.parse(fastq, "fastq")])


	def LengthHistogram(lengths):
	''' Create a histogram, and return the number of reads per bin and bin edges'''
	plt.hist(lengths, bins='auto')
	plt.savefig(os.path.join(args.outdir, "LengthHistogram.png"), format='png', dpi=1000)
	plt.close("all")
	hist, bin_edges = np.histogram(lengths, bins='auto')
	maxindex = np.argmax(hist)
	return((bin_edges[maxindex], bin_edges[maxindex + 1]))


	def getBin(fq, sizeRange):
	''' Loop over the fastq file, extract sequence and quality scores in lists'''
	return [(list(record.seq), list(record.letter_annotations["phred_quality"])) for record in SeqIO.parse(fq, "fastq") if sizeRange[0] < len(record) < sizeRange[1]]


	def plotNucleotideDiversity(fqlists):
	'''
	Create a FastQC-like "Per base sequence content" plot
	Plot fraction of nucleotides per position
	zip will stop when shortest read is exhausted
	'''
	numreads = len(fqlists)
	sns.set_style("darkgrid")
	plt.plot(np.array([position.count('A') / numreads for position in zip(*fqlists)]), 'green')
	plt.plot(np.array([position.count('T') / numreads for position in zip(*fqlists)]), 'red')
	plt.plot(np.array([position.count('G') / numreads for position in zip(*fqlists)]), 'black')
	plt.plot(np.array([position.count('C') / numreads for position in zip(*fqlists)]), 'blue')
	plt.legend(['A', 'T', 'G', 'C'], loc='upper right')
	plt.xlabel('Position in read')
	plt.ylabel('Per base sequence content')
	plt.savefig(os.path.join(args.outdir, "PerBaseSequenceContent.png"), format='png', dpi=1000)
	plt.close("all")


	def plotQual(quallist):
	'''
	Create a FastQC-like "Per base sequence quality" plot
	Plot average quality per position
	zip will stop when shortest read is exhausted
	'''
	sns.set_style("darkgrid")
	plt.plot(np.array([np.mean(position) for position in zip(*quallist)]), 'red')
	plt.xlabel('Position in read')
	plt.ylabel('Per base sequence quality')
	plt.savefig(os.path.join(args.outdir, "PerBaseSequenceQuality.png"), format='png', dpi=1000)
	plt.close("all")

	if __name__ == "__main__":
	main()