Buttonwood/fastqc.cpp

## fastqc.cpp
// #=============================================================================
// #     FileName: fastqc.cpp
// #         Desc: A program for fastqc stat files;
// #       Author: tanhao
// #        Email: tanhao2013@gmail.com
// #     HomePage: http://buttonwood.github.io
// #      Version: 0.0.1
// #   LastChange: 2014-04-10 16:27:55
// #      History:
// #=============================================================================

// gcc -Wall fastqc.cpp -lstdc++ -lgzstream -lz -o fastqc

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <string>
#include <iostream>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <gzstream.h>
#include <ctype.h>

using namespace std;


int main(int argc, char * const argv[])
{
	int Q_SHIFT = 33;
	int len = 100;
	int c;
	while ((c = getopt(argc, argv, "l:q:")) >= 0) {
		switch (c) {
			case 'q': Q_SHIFT = atoi(optarg); break;
			case 'l': len = atoi(optarg); break;
		}
	}

	if (optind == argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage: fastqc [options] <in.fq1> \n\n");
		fprintf(stderr, "Example: fastqc -l 100 -q 33 in.fq1.gz >in.fq1.stat\n\n");
		fprintf(stderr, "Options: -q INT Quality value; Sanger/Illumia 1.8+: 33; Illumia 1.3+: 64; Default: [%d]\n", Q_SHIFT);
		fprintf(stderr, "         -l INT Read length; Default: [%d]\n", len);
		fprintf(stderr, "\n");
		return 1;
	}

	//fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
	igzstream in(argv[optind]);

    string rseq;

    long **data;
    data = new long*[len];
    for(unsigned i = 0; i < len; ++i) {
    	data[i] = new long[47];
    }

    for(unsigned i = 0; i < len; ++i) {
    	for(unsigned j = 0; j < 47; ++j) {
    		data[i][j] = 0;
    	}
    }

    while (getline(in, rseq)){
        getline(in, rseq, '\n');
        for(unsigned i = 0; i < rseq.length(); ++i) {
        	if (rseq[i] == 'A' || rseq[i] == 'a')
        	{
        		data[i][0]++;
        	}
        	if (rseq[i] == 'T' || rseq[i] == 't')
        	{
        		data[i][1]++;
        	}
        	if (rseq[i] == 'G' || rseq[i] == 'g')
        	{
        		data[i][2]++;
        	}
        	if (rseq[i] == 'C' || rseq[i] == 'c')
        	{
        		data[i][3]++;
        	}
        	if (rseq[i] == 'N' || rseq[i] == 'n')
        	{
        		data[i][4]++;
        	}
        }
       	getline(in, rseq, '\n');
        getline(in, rseq, '\n');
        for(unsigned i = 0; i < rseq.length(); ++i) {
        	//data[i][ord(rseq[i]-Q_SHIFT)]++;
        	int LOrdValue = toascii(rseq[i]) - Q_SHIFT + 4;
        	data[i][LOrdValue]++;
        }
    }

    for (int i = 0; i < len; ++i)
    {
    	for (int j = 0; j < 47; ++j)
    	{
    		cout << data[i][j] << "\t";
    	}
    	cout << endl;
    }

    for(unsigned i = 0; i < len; ++i) {
    	delete [] data[i];
    }
    delete [] data;

	return 0;
}

// *************  Float like a butterfly! Stand like a buttonwood!  *************

## fastqdup.py
#=============================================================================
#     FileName: fastqdup.py
#         Desc: A short script for fastq duplication statistics.
#       Author: tanhao
#        Email: tanhao2013@gmail.com
#     HomePage: http://buttonwood.github.io
#      Version: 0.0.1
#   LastChange: 2014-04-15 11:52:31
#      History:
#=============================================================================

import argparse
import gzip

# def byLineReader(filename):
# 	if filename.endswith(".gz"):
# 		f = gzip.open(filename,'r')
# 	else:
# 		f = open(filename,'r')
# 	line = f.readline().rstrip("\n")
# 	while line:
# 		yield line
# 		line = f.readline().rstrip("\n")
# 	f.close()
# 	yield None

def main(args):
	prefix = args.read1.split("\.")[0]
	suffix = "clean.dup.gz"
	if args.suffix:
		suffix = args.suffix + ".dup.gz"
	clean1 = prefix + ".R1." + suffix
	clean2 = prefix + ".R2." + suffix
	statis = prefix + ".dup.stat"

	if not args.read1.endswith(".gz"):
		a = open(args.read1,'r')
		b = open(args.read2,'r')
	else:
		a = gzip.open(args.read1,'r')
		b = gzip.open(args.read2,'r')
	# a = byLineReader(args.read1)
	# b = byLineReader(args.read2)

	c = gzip.open(clean1,'w')
	d = gzip.open(clean2,'w')
	e = open(statis, "w")

	dup = {}
	num = 0
	while 1:
		num   += 1
		f1_tag = a.readline().rstrip("\n")
		if not f1_tag:
			break
		f1_seq = a.readline().rstrip("\n")
		a.readline().rstrip("\n")
		f1_qua = a.readline().rstrip("\n")

		f2_tag = b.readline().rstrip("\n")
		f2_seq = b.readline().rstrip("\n")
		b.readline().rstrip("\n")
		f2_qua = b.readline().rstrip("\n")

		pe_seq = f1_seq + f2_seq
		kseq   = hash(pe_seq)

		if kseq in dup:
			e.write(repr(dup[kseq]) + "\t" + repr(num) + "\n")
		else:
			dup[kseq] = num
			c.write(f1_tag + "\n")
			c.write(f1_seq + "\n")
			c.write("+\n")
			c.write(f1_qua + "\n")

			d.write(f2_tag + "\n")
			d.write(f2_seq + "\n")
			d.write("+\n")
			d.write(f2_qua + "\n")

	for ifile in [a,b,c,d,e]:
		ifile.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="""
A script for fastq duplication statistics.

Input   ->
	A pair of reads, also support for gz files.

Output  ->
	Clean PE reads; and a stat file records for duplication
	reads locations for checking up.

Examples ->
	python fastqdup.py -r1 r1.fq -r2 r2.fq -s clean
        """)
    parser.add_argument(
        "-r1", "--read1",
        required=True,
        help="input read1 file [must]")
    parser.add_argument(
        "-r2", "--read2",
        required=True,
        help="input read2 file [must]")
    parser.add_argument(
        "-s", "--suffix",
        required=False,
        help="output files suffix [Default:clean]")
    args = parser.parse_args()
    main(args)

## plot.R
#=============================================================================
#     FileName: plot.R
#         Desc: A R Script for Nucleotide Bases Statistics and Quality Heatmap.
#       Author: tanhao
#        Email: tanhao2013@gmail.com
#     HomePage: http://buttonwood.github.io
#      Version: 0.0.1
#   LastChange: 2014-04-14 13:41:49
#      History:
#=============================================================================

data <- read.table("Lac-pen-7_L2_I112.R1.clean.fastq_clean.fq.gz.stat");

base <- data[,1] + data[,2] + data[,3] + data[,4] + data[,5];

for (i in 1:length(data[1,]) ){
    data[,i] = data[,i]*100/base;
}

pdf("out.pdf")
library("ggplot2")
library("reshape2")
library("scales")


# Nucleotide Bases Statistics.
d <- data.frame(x = rep(1:length(data[,1]), 5),
                y = c(data[,1], data[,2], data[,3], data[,4], data[,5]),
                Base = rep(c("A","T","C","G","N"), each=length(data[,1]))
)

p <- ggplot(d, aes(x, y, color = Base)) + geom_path()
p <- p + theme_bw() + xlab("Base Positions (bp)") + ylab("Content (%)")
P <- p + theme(legend.position = c(0.2, 0.88), legend.title = element_blank())
print(p)


# Quality Heatmap.
d <- data[,6:length(data[1,])]
d <- as.data.frame(d)
m <- dim(d)[1]
n <- dim(d)[2]

a <- as.matrix(d)
head(a)
rownames(a) <- c(1:m)
colnames(a) <- c(1:n)
head(a)
a <- melt(a)
head(a)
p <- ggplot(data = a, aes(x = Var1, y = Var2))
p <- p + geom_tile(aes(fill = value))
p <- p + scale_fill_gradientn(colours = c("white", "lightblue", "blue","darkblue", "lightblue", "lightgreen", "green", "darkgreen"),
                              values  = rescale(c(0, 20, 40, 60, 80, 100)),
                              guide   = "colorbar")
p <- p + xlab('Base Positions (bp)') + ylab("Quality value") + opts(title="Heatmap of Base Quality along Base Positions")
print(p)


# *************  Float like a butterfly! Stand like a buttonwood!  *************
	// #=============================================================================
	// # FileName: fastqc.cpp
	// # Desc: A program for fastqc stat files;
	// # Author: tanhao
	// # Email: tanhao2013@gmail.com
	// # HomePage: http://buttonwood.github.io
	// # Version: 0.0.1
	// # LastChange: 2014-04-10 16:27:55
	// # History:
	// #=============================================================================

	// gcc -Wall fastqc.cpp -lstdc++ -lgzstream -lz -o fastqc

	#include <cstdio>
	#include <cstdlib>
	#include <vector>
	#include <string>
	#include <iostream>
	#include <unistd.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <gzstream.h>
	#include <ctype.h>

	using namespace std;


	int main(int argc, char * const argv[])
	{
	int Q_SHIFT = 33;
	int len = 100;
	int c;
	while ((c = getopt(argc, argv, "l:q:")) >= 0) {
	switch (c) {
	case 'q': Q_SHIFT = atoi(optarg); break;
	case 'l': len = atoi(optarg); break;
	}
	}

	if (optind == argc) {
	fprintf(stderr, "\n");
	fprintf(stderr, "Usage: fastqc [options] <in.fq1> \n\n");
	fprintf(stderr, "Example: fastqc -l 100 -q 33 in.fq1.gz >in.fq1.stat\n\n");
	fprintf(stderr, "Options: -q INT Quality value; Sanger/Illumia 1.8+: 33; Illumia 1.3+: 64; Default: [%d]\n", Q_SHIFT);
	fprintf(stderr, " -l INT Read length; Default: [%d]\n", len);
	fprintf(stderr, "\n");
	return 1;
	}

	//fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
	igzstream in(argv[optind]);

	string rseq;

	long **data;
	data = new long*[len];
	for(unsigned i = 0; i < len; ++i) {
	data[i] = new long[47];
	}

	for(unsigned i = 0; i < len; ++i) {
	for(unsigned j = 0; j < 47; ++j) {
	data[i][j] = 0;
	}
	}

	while (getline(in, rseq)){
	getline(in, rseq, '\n');
	for(unsigned i = 0; i < rseq.length(); ++i) {
	if (rseq[i] == 'A' \|\| rseq[i] == 'a')
	{
	data[i][0]++;
	}
	if (rseq[i] == 'T' \|\| rseq[i] == 't')
	{
	data[i][1]++;
	}
	if (rseq[i] == 'G' \|\| rseq[i] == 'g')
	{
	data[i][2]++;
	}
	if (rseq[i] == 'C' \|\| rseq[i] == 'c')
	{
	data[i][3]++;
	}
	if (rseq[i] == 'N' \|\| rseq[i] == 'n')
	{
	data[i][4]++;
	}
	}
	getline(in, rseq, '\n');
	getline(in, rseq, '\n');
	for(unsigned i = 0; i < rseq.length(); ++i) {
	//data[i][ord(rseq[i]-Q_SHIFT)]++;
	int LOrdValue = toascii(rseq[i]) - Q_SHIFT + 4;
	data[i][LOrdValue]++;
	}
	}

	for (int i = 0; i < len; ++i)
	{
	for (int j = 0; j < 47; ++j)
	{
	cout << data[i][j] << "\t";
	}
	cout << endl;
	}

	for(unsigned i = 0; i < len; ++i) {
	delete [] data[i];
	}
	delete [] data;

	return 0;
	}

	// *********** Float like a butterfly! Stand like a buttonwood! ***********
	#=============================================================================
	# FileName: fastqdup.py
	# Desc: A short script for fastq duplication statistics.
	# Author: tanhao
	# Email: tanhao2013@gmail.com
	# HomePage: http://buttonwood.github.io
	# Version: 0.0.1
	# LastChange: 2014-04-15 11:52:31
	# History:
	#=============================================================================

	import argparse
	import gzip

	# def byLineReader(filename):
	# if filename.endswith(".gz"):
	# f = gzip.open(filename,'r')
	# else:
	# f = open(filename,'r')
	# line = f.readline().rstrip("\n")
	# while line:
	# yield line
	# line = f.readline().rstrip("\n")
	# f.close()
	# yield None

	def main(args):
	prefix = args.read1.split("\.")[0]
	suffix = "clean.dup.gz"
	if args.suffix:
	suffix = args.suffix + ".dup.gz"
	clean1 = prefix + ".R1." + suffix
	clean2 = prefix + ".R2." + suffix
	statis = prefix + ".dup.stat"

	if not args.read1.endswith(".gz"):
	a = open(args.read1,'r')
	b = open(args.read2,'r')
	else:
	a = gzip.open(args.read1,'r')
	b = gzip.open(args.read2,'r')
	# a = byLineReader(args.read1)
	# b = byLineReader(args.read2)

	c = gzip.open(clean1,'w')
	d = gzip.open(clean2,'w')
	e = open(statis, "w")

	dup = {}
	num = 0
	while 1:
	num += 1
	f1_tag = a.readline().rstrip("\n")
	if not f1_tag:
	break
	f1_seq = a.readline().rstrip("\n")
	a.readline().rstrip("\n")
	f1_qua = a.readline().rstrip("\n")

	f2_tag = b.readline().rstrip("\n")
	f2_seq = b.readline().rstrip("\n")
	b.readline().rstrip("\n")
	f2_qua = b.readline().rstrip("\n")

	pe_seq = f1_seq + f2_seq
	kseq = hash(pe_seq)

	if kseq in dup:
	e.write(repr(dup[kseq]) + "\t" + repr(num) + "\n")
	else:
	dup[kseq] = num
	c.write(f1_tag + "\n")
	c.write(f1_seq + "\n")
	c.write("+\n")
	c.write(f1_qua + "\n")

	d.write(f2_tag + "\n")
	d.write(f2_seq + "\n")
	d.write("+\n")
	d.write(f2_qua + "\n")

	for ifile in [a,b,c,d,e]:
	ifile.close()


	if __name__ == "__main__":
	parser = argparse.ArgumentParser(
	formatter_class=argparse.RawTextHelpFormatter,
	description="""
	A script for fastq duplication statistics.

	Input ->
	A pair of reads, also support for gz files.

	Output ->
	Clean PE reads; and a stat file records for duplication
	reads locations for checking up.

	Examples ->
	python fastqdup.py -r1 r1.fq -r2 r2.fq -s clean
	""")
	parser.add_argument(
	"-r1", "--read1",
	required=True,
	help="input read1 file [must]")
	parser.add_argument(
	"-r2", "--read2",
	required=True,
	help="input read2 file [must]")
	parser.add_argument(
	"-s", "--suffix",
	required=False,
	help="output files suffix [Default:clean]")
	args = parser.parse_args()
	main(args)
	#=============================================================================
	# FileName: plot.R
	# Desc: A R Script for Nucleotide Bases Statistics and Quality Heatmap.
	# Author: tanhao
	# Email: tanhao2013@gmail.com
	# HomePage: http://buttonwood.github.io
	# Version: 0.0.1
	# LastChange: 2014-04-14 13:41:49
	# History:
	#=============================================================================

	data <- read.table("Lac-pen-7_L2_I112.R1.clean.fastq_clean.fq.gz.stat");

	base <- data[,1] + data[,2] + data[,3] + data[,4] + data[,5];

	for (i in 1:length(data[1,]) ){
	data[,i] = data[,i]*100/base;
	}

	pdf("out.pdf")
	library("ggplot2")
	library("reshape2")
	library("scales")


	# Nucleotide Bases Statistics.
	d <- data.frame(x = rep(1:length(data[,1]), 5),
	y = c(data[,1], data[,2], data[,3], data[,4], data[,5]),
	Base = rep(c("A","T","C","G","N"), each=length(data[,1]))
	)

	p <- ggplot(d, aes(x, y, color = Base)) + geom_path()
	p <- p + theme_bw() + xlab("Base Positions (bp)") + ylab("Content (%)")
	P <- p + theme(legend.position = c(0.2, 0.88), legend.title = element_blank())
	print(p)


	# Quality Heatmap.
	d <- data[,6:length(data[1,])]
	d <- as.data.frame(d)
	m <- dim(d)[1]
	n <- dim(d)[2]

	a <- as.matrix(d)
	head(a)
	rownames(a) <- c(1:m)
	colnames(a) <- c(1:n)
	head(a)
	a <- melt(a)
	head(a)
	p <- ggplot(data = a, aes(x = Var1, y = Var2))
	p <- p + geom_tile(aes(fill = value))
	p <- p + scale_fill_gradientn(colours = c("white", "lightblue", "blue","darkblue", "lightblue", "lightgreen", "green", "darkgreen"),
	values = rescale(c(0, 20, 40, 60, 80, 100)),
	guide = "colorbar")
	p <- p + xlab('Base Positions (bp)') + ylab("Quality value") + opts(title="Heatmap of Base Quality along Base Positions")
	print(p)


	# *********** Float like a butterfly! Stand like a buttonwood! ***********