Skip to content

Instantly share code, notes, and snippets.

@Buttonwood
Last active March 29, 2016 14:20
Show Gist options
  • Save Buttonwood/a1267baab19a0c6a3b6b to your computer and use it in GitHub Desktop.
Save Buttonwood/a1267baab19a0c6a3b6b to your computer and use it in GitHub Desktop.
fastqc
// #=============================================================================
// # FileName: fastqc.cpp
// # Desc: A program for fastqc stat files;
// # Author: tanhao
// # Email: tanhao2013@gmail.com
// # HomePage: http://buttonwood.github.io
// # Version: 0.0.1
// # LastChange: 2014-04-10 16:27:55
// # History:
// #=============================================================================
// gcc -Wall fastqc.cpp -lstdc++ -lgzstream -lz -o fastqc
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <string>
#include <iostream>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <gzstream.h>
#include <ctype.h>
using namespace std;
int main(int argc, char * const argv[])
{
int Q_SHIFT = 33;
int len = 100;
int c;
while ((c = getopt(argc, argv, "l:q:")) >= 0) {
switch (c) {
case 'q': Q_SHIFT = atoi(optarg); break;
case 'l': len = atoi(optarg); break;
}
}
if (optind == argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: fastqc [options] <in.fq1> \n\n");
fprintf(stderr, "Example: fastqc -l 100 -q 33 in.fq1.gz >in.fq1.stat\n\n");
fprintf(stderr, "Options: -q INT Quality value; Sanger/Illumia 1.8+: 33; Illumia 1.3+: 64; Default: [%d]\n", Q_SHIFT);
fprintf(stderr, " -l INT Read length; Default: [%d]\n", len);
fprintf(stderr, "\n");
return 1;
}
//fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
igzstream in(argv[optind]);
string rseq;
long **data;
data = new long*[len];
for(unsigned i = 0; i < len; ++i) {
data[i] = new long[47];
}
for(unsigned i = 0; i < len; ++i) {
for(unsigned j = 0; j < 47; ++j) {
data[i][j] = 0;
}
}
while (getline(in, rseq)){
getline(in, rseq, '\n');
for(unsigned i = 0; i < rseq.length(); ++i) {
if (rseq[i] == 'A' || rseq[i] == 'a')
{
data[i][0]++;
}
if (rseq[i] == 'T' || rseq[i] == 't')
{
data[i][1]++;
}
if (rseq[i] == 'G' || rseq[i] == 'g')
{
data[i][2]++;
}
if (rseq[i] == 'C' || rseq[i] == 'c')
{
data[i][3]++;
}
if (rseq[i] == 'N' || rseq[i] == 'n')
{
data[i][4]++;
}
}
getline(in, rseq, '\n');
getline(in, rseq, '\n');
for(unsigned i = 0; i < rseq.length(); ++i) {
//data[i][ord(rseq[i]-Q_SHIFT)]++;
int LOrdValue = toascii(rseq[i]) - Q_SHIFT + 4;
data[i][LOrdValue]++;
}
}
for (int i = 0; i < len; ++i)
{
for (int j = 0; j < 47; ++j)
{
cout << data[i][j] << "\t";
}
cout << endl;
}
for(unsigned i = 0; i < len; ++i) {
delete [] data[i];
}
delete [] data;
return 0;
}
// ************* Float like a butterfly! Stand like a buttonwood! *************
#=============================================================================
# FileName: fastqdup.py
# Desc: A short script for fastq duplication statistics.
# Author: tanhao
# Email: tanhao2013@gmail.com
# HomePage: http://buttonwood.github.io
# Version: 0.0.1
# LastChange: 2014-04-15 11:52:31
# History:
#=============================================================================
import argparse
import gzip
# def byLineReader(filename):
# if filename.endswith(".gz"):
# f = gzip.open(filename,'r')
# else:
# f = open(filename,'r')
# line = f.readline().rstrip("\n")
# while line:
# yield line
# line = f.readline().rstrip("\n")
# f.close()
# yield None
def main(args):
prefix = args.read1.split("\.")[0]
suffix = "clean.dup.gz"
if args.suffix:
suffix = args.suffix + ".dup.gz"
clean1 = prefix + ".R1." + suffix
clean2 = prefix + ".R2." + suffix
statis = prefix + ".dup.stat"
if not args.read1.endswith(".gz"):
a = open(args.read1,'r')
b = open(args.read2,'r')
else:
a = gzip.open(args.read1,'r')
b = gzip.open(args.read2,'r')
# a = byLineReader(args.read1)
# b = byLineReader(args.read2)
c = gzip.open(clean1,'w')
d = gzip.open(clean2,'w')
e = open(statis, "w")
dup = {}
num = 0
while 1:
num += 1
f1_tag = a.readline().rstrip("\n")
if not f1_tag:
break
f1_seq = a.readline().rstrip("\n")
a.readline().rstrip("\n")
f1_qua = a.readline().rstrip("\n")
f2_tag = b.readline().rstrip("\n")
f2_seq = b.readline().rstrip("\n")
b.readline().rstrip("\n")
f2_qua = b.readline().rstrip("\n")
pe_seq = f1_seq + f2_seq
kseq = hash(pe_seq)
if kseq in dup:
e.write(repr(dup[kseq]) + "\t" + repr(num) + "\n")
else:
dup[kseq] = num
c.write(f1_tag + "\n")
c.write(f1_seq + "\n")
c.write("+\n")
c.write(f1_qua + "\n")
d.write(f2_tag + "\n")
d.write(f2_seq + "\n")
d.write("+\n")
d.write(f2_qua + "\n")
for ifile in [a,b,c,d,e]:
ifile.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
formatter_class=argparse.RawTextHelpFormatter,
description="""
A script for fastq duplication statistics.
Input ->
A pair of reads, also support for gz files.
Output ->
Clean PE reads; and a stat file records for duplication
reads locations for checking up.
Examples ->
python fastqdup.py -r1 r1.fq -r2 r2.fq -s clean
""")
parser.add_argument(
"-r1", "--read1",
required=True,
help="input read1 file [must]")
parser.add_argument(
"-r2", "--read2",
required=True,
help="input read2 file [must]")
parser.add_argument(
"-s", "--suffix",
required=False,
help="output files suffix [Default:clean]")
args = parser.parse_args()
main(args)
#=============================================================================
# FileName: plot.R
# Desc: A R Script for Nucleotide Bases Statistics and Quality Heatmap.
# Author: tanhao
# Email: tanhao2013@gmail.com
# HomePage: http://buttonwood.github.io
# Version: 0.0.1
# LastChange: 2014-04-14 13:41:49
# History:
#=============================================================================
data <- read.table("Lac-pen-7_L2_I112.R1.clean.fastq_clean.fq.gz.stat");
base <- data[,1] + data[,2] + data[,3] + data[,4] + data[,5];
for (i in 1:length(data[1,]) ){
data[,i] = data[,i]*100/base;
}
pdf("out.pdf")
library("ggplot2")
library("reshape2")
library("scales")
# Nucleotide Bases Statistics.
d <- data.frame(x = rep(1:length(data[,1]), 5),
y = c(data[,1], data[,2], data[,3], data[,4], data[,5]),
Base = rep(c("A","T","C","G","N"), each=length(data[,1]))
)
p <- ggplot(d, aes(x, y, color = Base)) + geom_path()
p <- p + theme_bw() + xlab("Base Positions (bp)") + ylab("Content (%)")
P <- p + theme(legend.position = c(0.2, 0.88), legend.title = element_blank())
print(p)
# Quality Heatmap.
d <- data[,6:length(data[1,])]
d <- as.data.frame(d)
m <- dim(d)[1]
n <- dim(d)[2]
a <- as.matrix(d)
head(a)
rownames(a) <- c(1:m)
colnames(a) <- c(1:n)
head(a)
a <- melt(a)
head(a)
p <- ggplot(data = a, aes(x = Var1, y = Var2))
p <- p + geom_tile(aes(fill = value))
p <- p + scale_fill_gradientn(colours = c("white", "lightblue", "blue","darkblue", "lightblue", "lightgreen", "green", "darkgreen"),
values = rescale(c(0, 20, 40, 60, 80, 100)),
guide = "colorbar")
p <- p + xlab('Base Positions (bp)') + ylab("Quality value") + opts(title="Heatmap of Base Quality along Base Positions")
print(p)
# ************* Float like a butterfly! Stand like a buttonwood! *************
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment