Created
April 24, 2017 21:05
-
-
Save wdecoster/7cad6080950fa1e3ae3eaeeac4f6ae4d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# wdecoster | |
import os | |
import seaborn as sns | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import argparse | |
from Bio import SeqIO | |
parser = argparse.ArgumentParser( | |
description="Investigate overrepresented nucleotides in reads of median length.") | |
parser.add_argument("--outdir", | |
help="Specify directory in which output has to be created.", | |
default=".") | |
parser.add_argument("fastq", | |
help="Reads data in fastq format.") | |
args = parser.parse_args() | |
def main(): | |
lengths = getLengths(args.fastq) | |
sizeRange = LengthHistogram(lengths) | |
fq = getBin(args.fastq, sizeRange) | |
plotNucleotideDiversity2([dat[0] for dat in fq]) | |
plotQual([dat[1] for dat in fq]) | |
def getLengths(fastq): | |
''' Loop over the fastq file, extract length of sequences''' | |
return np.array([len(record) for record in SeqIO.parse(fastq, "fastq")]) | |
def LengthHistogram(lengths): | |
''' Create a histogram, and return the number of reads per bin and bin edges''' | |
plt.hist(lengths, bins='auto') | |
plt.savefig(os.path.join(args.outdir, "LengthHistogram.png"), format='png', dpi=1000) | |
plt.close("all") | |
hist, bin_edges = np.histogram(lengths, bins='auto') | |
maxindex = np.argmax(hist) | |
return((bin_edges[maxindex], bin_edges[maxindex + 1])) | |
def getBin(fq, sizeRange): | |
''' Loop over the fastq file, extract sequence and quality scores in lists''' | |
return [(list(record.seq), list(record.letter_annotations["phred_quality"])) for record in SeqIO.parse(fq, "fastq") if sizeRange[0] < len(record) < sizeRange[1]] | |
def plotNucleotideDiversity(fqlists): | |
''' | |
Create a FastQC-like "Per base sequence content" plot | |
Plot fraction of nucleotides per position | |
zip will stop when shortest read is exhausted | |
''' | |
numreads = len(fqlists) | |
sns.set_style("darkgrid") | |
plt.plot(np.array([position.count('A') / numreads for position in zip(*fqlists)]), 'green') | |
plt.plot(np.array([position.count('T') / numreads for position in zip(*fqlists)]), 'red') | |
plt.plot(np.array([position.count('G') / numreads for position in zip(*fqlists)]), 'black') | |
plt.plot(np.array([position.count('C') / numreads for position in zip(*fqlists)]), 'blue') | |
plt.legend(['A', 'T', 'G', 'C'], loc='upper right') | |
plt.xlabel('Position in read') | |
plt.ylabel('Per base sequence content') | |
plt.savefig(os.path.join(args.outdir, "PerBaseSequenceContent.png"), format='png', dpi=1000) | |
plt.close("all") | |
def plotQual(quallist): | |
''' | |
Create a FastQC-like "Per base sequence quality" plot | |
Plot average quality per position | |
zip will stop when shortest read is exhausted | |
''' | |
sns.set_style("darkgrid") | |
plt.plot(np.array([np.mean(position) for position in zip(*quallist)]), 'red') | |
plt.xlabel('Position in read') | |
plt.ylabel('Per base sequence quality') | |
plt.savefig(os.path.join(args.outdir, "PerBaseSequenceQuality.png"), format='png', dpi=1000) | |
plt.close("all") | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment