Last active
March 29, 2016 14:20
-
-
Save Buttonwood/a1267baab19a0c6a3b6b to your computer and use it in GitHub Desktop.
fastqc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// #============================================================================= | |
// # FileName: fastqc.cpp | |
// # Desc: A program for fastqc stat files; | |
// # Author: tanhao | |
// # Email: tanhao2013@gmail.com | |
// # HomePage: http://buttonwood.github.io | |
// # Version: 0.0.1 | |
// # LastChange: 2014-04-10 16:27:55 | |
// # History: | |
// #============================================================================= | |
// gcc -Wall fastqc.cpp -lstdc++ -lgzstream -lz -o fastqc | |
#include <cstdio> | |
#include <cstdlib> | |
#include <vector> | |
#include <string> | |
#include <iostream> | |
#include <unistd.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <gzstream.h> | |
#include <ctype.h> | |
using namespace std; | |
int main(int argc, char * const argv[]) | |
{ | |
int Q_SHIFT = 33; | |
int len = 100; | |
int c; | |
while ((c = getopt(argc, argv, "l:q:")) >= 0) { | |
switch (c) { | |
case 'q': Q_SHIFT = atoi(optarg); break; | |
case 'l': len = atoi(optarg); break; | |
} | |
} | |
if (optind == argc) { | |
fprintf(stderr, "\n"); | |
fprintf(stderr, "Usage: fastqc [options] <in.fq1> \n\n"); | |
fprintf(stderr, "Example: fastqc -l 100 -q 33 in.fq1.gz >in.fq1.stat\n\n"); | |
fprintf(stderr, "Options: -q INT Quality value; Sanger/Illumia 1.8+: 33; Illumia 1.3+: 64; Default: [%d]\n", Q_SHIFT); | |
fprintf(stderr, " -l INT Read length; Default: [%d]\n", len); | |
fprintf(stderr, "\n"); | |
return 1; | |
} | |
//fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); | |
igzstream in(argv[optind]); | |
string rseq; | |
long **data; | |
data = new long*[len]; | |
for(unsigned i = 0; i < len; ++i) { | |
data[i] = new long[47]; | |
} | |
for(unsigned i = 0; i < len; ++i) { | |
for(unsigned j = 0; j < 47; ++j) { | |
data[i][j] = 0; | |
} | |
} | |
while (getline(in, rseq)){ | |
getline(in, rseq, '\n'); | |
for(unsigned i = 0; i < rseq.length(); ++i) { | |
if (rseq[i] == 'A' || rseq[i] == 'a') | |
{ | |
data[i][0]++; | |
} | |
if (rseq[i] == 'T' || rseq[i] == 't') | |
{ | |
data[i][1]++; | |
} | |
if (rseq[i] == 'G' || rseq[i] == 'g') | |
{ | |
data[i][2]++; | |
} | |
if (rseq[i] == 'C' || rseq[i] == 'c') | |
{ | |
data[i][3]++; | |
} | |
if (rseq[i] == 'N' || rseq[i] == 'n') | |
{ | |
data[i][4]++; | |
} | |
} | |
getline(in, rseq, '\n'); | |
getline(in, rseq, '\n'); | |
for(unsigned i = 0; i < rseq.length(); ++i) { | |
//data[i][ord(rseq[i]-Q_SHIFT)]++; | |
int LOrdValue = toascii(rseq[i]) - Q_SHIFT + 4; | |
data[i][LOrdValue]++; | |
} | |
} | |
for (int i = 0; i < len; ++i) | |
{ | |
for (int j = 0; j < 47; ++j) | |
{ | |
cout << data[i][j] << "\t"; | |
} | |
cout << endl; | |
} | |
for(unsigned i = 0; i < len; ++i) { | |
delete [] data[i]; | |
} | |
delete [] data; | |
return 0; | |
} | |
// ************* Float like a butterfly! Stand like a buttonwood! ************* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#============================================================================= | |
# FileName: fastqdup.py | |
# Desc: A short script for fastq duplication statistics. | |
# Author: tanhao | |
# Email: tanhao2013@gmail.com | |
# HomePage: http://buttonwood.github.io | |
# Version: 0.0.1 | |
# LastChange: 2014-04-15 11:52:31 | |
# History: | |
#============================================================================= | |
import argparse | |
import gzip | |
# def byLineReader(filename): | |
# if filename.endswith(".gz"): | |
# f = gzip.open(filename,'r') | |
# else: | |
# f = open(filename,'r') | |
# line = f.readline().rstrip("\n") | |
# while line: | |
# yield line | |
# line = f.readline().rstrip("\n") | |
# f.close() | |
# yield None | |
def main(args): | |
prefix = args.read1.split("\.")[0] | |
suffix = "clean.dup.gz" | |
if args.suffix: | |
suffix = args.suffix + ".dup.gz" | |
clean1 = prefix + ".R1." + suffix | |
clean2 = prefix + ".R2." + suffix | |
statis = prefix + ".dup.stat" | |
if not args.read1.endswith(".gz"): | |
a = open(args.read1,'r') | |
b = open(args.read2,'r') | |
else: | |
a = gzip.open(args.read1,'r') | |
b = gzip.open(args.read2,'r') | |
# a = byLineReader(args.read1) | |
# b = byLineReader(args.read2) | |
c = gzip.open(clean1,'w') | |
d = gzip.open(clean2,'w') | |
e = open(statis, "w") | |
dup = {} | |
num = 0 | |
while 1: | |
num += 1 | |
f1_tag = a.readline().rstrip("\n") | |
if not f1_tag: | |
break | |
f1_seq = a.readline().rstrip("\n") | |
a.readline().rstrip("\n") | |
f1_qua = a.readline().rstrip("\n") | |
f2_tag = b.readline().rstrip("\n") | |
f2_seq = b.readline().rstrip("\n") | |
b.readline().rstrip("\n") | |
f2_qua = b.readline().rstrip("\n") | |
pe_seq = f1_seq + f2_seq | |
kseq = hash(pe_seq) | |
if kseq in dup: | |
e.write(repr(dup[kseq]) + "\t" + repr(num) + "\n") | |
else: | |
dup[kseq] = num | |
c.write(f1_tag + "\n") | |
c.write(f1_seq + "\n") | |
c.write("+\n") | |
c.write(f1_qua + "\n") | |
d.write(f2_tag + "\n") | |
d.write(f2_seq + "\n") | |
d.write("+\n") | |
d.write(f2_qua + "\n") | |
for ifile in [a,b,c,d,e]: | |
ifile.close() | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser( | |
formatter_class=argparse.RawTextHelpFormatter, | |
description=""" | |
A script for fastq duplication statistics. | |
Input -> | |
A pair of reads, also support for gz files. | |
Output -> | |
Clean PE reads; and a stat file records for duplication | |
reads locations for checking up. | |
Examples -> | |
python fastqdup.py -r1 r1.fq -r2 r2.fq -s clean | |
""") | |
parser.add_argument( | |
"-r1", "--read1", | |
required=True, | |
help="input read1 file [must]") | |
parser.add_argument( | |
"-r2", "--read2", | |
required=True, | |
help="input read2 file [must]") | |
parser.add_argument( | |
"-s", "--suffix", | |
required=False, | |
help="output files suffix [Default:clean]") | |
args = parser.parse_args() | |
main(args) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#============================================================================= | |
# FileName: plot.R | |
# Desc: A R Script for Nucleotide Bases Statistics and Quality Heatmap. | |
# Author: tanhao | |
# Email: tanhao2013@gmail.com | |
# HomePage: http://buttonwood.github.io | |
# Version: 0.0.1 | |
# LastChange: 2014-04-14 13:41:49 | |
# History: | |
#============================================================================= | |
data <- read.table("Lac-pen-7_L2_I112.R1.clean.fastq_clean.fq.gz.stat"); | |
base <- data[,1] + data[,2] + data[,3] + data[,4] + data[,5]; | |
for (i in 1:length(data[1,]) ){ | |
data[,i] = data[,i]*100/base; | |
} | |
pdf("out.pdf") | |
library("ggplot2") | |
library("reshape2") | |
library("scales") | |
# Nucleotide Bases Statistics. | |
d <- data.frame(x = rep(1:length(data[,1]), 5), | |
y = c(data[,1], data[,2], data[,3], data[,4], data[,5]), | |
Base = rep(c("A","T","C","G","N"), each=length(data[,1])) | |
) | |
p <- ggplot(d, aes(x, y, color = Base)) + geom_path() | |
p <- p + theme_bw() + xlab("Base Positions (bp)") + ylab("Content (%)") | |
P <- p + theme(legend.position = c(0.2, 0.88), legend.title = element_blank()) | |
print(p) | |
# Quality Heatmap. | |
d <- data[,6:length(data[1,])] | |
d <- as.data.frame(d) | |
m <- dim(d)[1] | |
n <- dim(d)[2] | |
a <- as.matrix(d) | |
head(a) | |
rownames(a) <- c(1:m) | |
colnames(a) <- c(1:n) | |
head(a) | |
a <- melt(a) | |
head(a) | |
p <- ggplot(data = a, aes(x = Var1, y = Var2)) | |
p <- p + geom_tile(aes(fill = value)) | |
p <- p + scale_fill_gradientn(colours = c("white", "lightblue", "blue","darkblue", "lightblue", "lightgreen", "green", "darkgreen"), | |
values = rescale(c(0, 20, 40, 60, 80, 100)), | |
guide = "colorbar") | |
p <- p + xlab('Base Positions (bp)') + ylab("Quality value") + opts(title="Heatmap of Base Quality along Base Positions") | |
print(p) | |
# ************* Float like a butterfly! Stand like a buttonwood! ************* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment