sklarz-bgu/fastqc_summary.R

## fastqc_summary.R
library("magrittr")
library("optparse")

args = commandArgs(trailingOnly=TRUE)

option_list = list(
    make_option(c("-d", "--dir"), type="character", default=NULL,
                help="Directory of fastqc results", metavar="character"),
    make_option(c("-o", "--outdir"), type="character", default=NULL,
                help="An existing dir in which to create the output files.", metavar="character")
);

opt_parser = optparse::OptionParser(usage = "usage: %prog [options]", option_list=option_list,epilogue="\n\nAuthor: Menachem Sklarz");
opt = optparse::parse_args(opt_parser);


if (is.null(opt$outdir)){
    opt$outdir <- dirname(normalizePath(opt$dir))
    exit(cat("No --outdir was passed!"))
}


out.dir <- opt$outdir
zipdir  <- opt$dir

out.summary <- sprintf("%s/%s",out.dir,"fastqc_summary.txt")
out.basic   <- sprintf("%s/%s",out.dir,"fastqc_BasicStatistics.txt")

temp.summary <- sprintf("%s/%s",out.dir,"summary.txt")
temp.stats   <- sprintf("%s/%s",out.dir,"fastqc_data.txt")


# Get list of zip files:
zipf_list <- dir(zipdir,recursive = T)      %>%
                grep(pattern="zip",value=T) %>%
                paste(zipdir,.,sep="")

cat("Summarizing tests...\n")
# Extract and summarize test statistics:
for (i in 1:length(zipf_list)) {
    zipf = zipf_list[i]
    cat(sprintf("   File... %s\n",zipf))
    # List files in a zipped file:
    archive = unzip(zipf,list=T)[,1]  %>%
                grep(pattern="summary.txt",value=T)
    unzip(zipf,
          files     = archive,
          junkpaths = T,
          exdir     = out.dir)
    fastqc_table = read.delim(temp.summary,
                              header           = F,
                              stringsAsFactors = F)
    names(fastqc_table) <- c(fastqc_table[1,3],"test")
    fastqc_table <- fastqc_table[,-3]
    if (i==1) {
        final_fqc_table = fastqc_table
    } else {
        final_fqc_table <- merge(final_fqc_table, fastqc_table, by="test")
    }
}

final_fqc_table <- t(final_fqc_table)
colnames(final_fqc_table) <- final_fqc_table[1,]
final_fqc_table <- final_fqc_table[-1,]

write.table(file = out.summary, x = final_fqc_table,quote=F,row.names = T, sep="\t")

cat("Summarizing basic stats...\n")
# Extract and summarize test statistics:
for (i in 1:length(zipf_list)) {
    zipf = zipf_list[i]
    cat(sprintf("   File... %s\n",zipf))
    # List files in a zipped file:
    archive = unzip(zipf,list=T)[,1]  %>%
                grep(pattern="fastqc_data.txt",value=T)
    unzip(zipf,
          files     = archive,
          junkpaths = T,
          exdir     = out.dir)
    fastqc_table = read.delim(temp.stats,
                              header = F,
                              stringsAsFactors = F)
    names(fastqc_table) <- c("param","value")
    # Get range of indeices containng the size distribution data:
    size_dist_range = (which((fastqc_table$param==">>Sequence Length Distribution"))+1):
        (which(fastqc_table$param==">>END_MODULE")[min(which(which(fastqc_table$param==">>END_MODULE")>
                                                                 which(fastqc_table$param==">>Sequence Length Distribution")))]-1)
    # Get the actual part of the table:
    size_dist_table <- fastqc_table[size_dist_range,]
    # Get the max part of the bin:
    # if size_dist_table is of length 1 (w/o header), then all seqs are same length. Treat separately:
    if (length(size_dist_range)>2){
        bins = lapply(X = strsplit(size_dist_table$param[-1],split = "-",fixed = T),FUN = function(x) x[2]) %>% unlist() %>% as.numeric()
        # Get the value:
        vals = size_dist_table$value[-1] %>% as.numeric
        # Agglomerate to nearest 50bp (may be problem with bin edges...)
        size_dist_table <- tapply(X=vals, INDEX=as.factor(50-bins%%50+bins), FUN=sum)
        size_dist_table <- data.frame(param=names(size_dist_table),value=size_dist_table)
        size_dist_table$param <- sprintf("L%03d-%03d",as.numeric(as.character(size_dist_table$param))-49,as.numeric(as.character(size_dist_table$param)))
    } else {
        bins = size_dist_table$param[-1] %>% as.numeric()
        # Get the value:
        vals = size_dist_table$value[-1] %>% as.numeric
        # Agglomerate to nearest 50bp (may be problem with bin edges...)
        size_dist_table <- data.frame(param=paste("L",bins,sep=""),value=vals)
    }


    fastqc_table <- fastqc_table[fastqc_table$param %in%
                                     c("Filename","File type","Encoding",
                                       "Total Sequences","Sequences flagged as poor quality",
                                       "Sequence length","%GC"),]
    names(size_dist_table) <- c("param",fastqc_table$value[fastqc_table$param=="Filename"])
    names(fastqc_table) <- c("param",fastqc_table$value[fastqc_table$param=="Filename"])

    fastqc_table <- rbind(fastqc_table,size_dist_table)
    # Get length distribution lines -> table
    # Apply -> bins of length 50bp
    # Add bins to end of column
    if (i==1) {
        final_fqc_table = fastqc_table
    } else {
        final_fqc_table <- merge(final_fqc_table, fastqc_table, by="param",all = T)
    }
}

# Transpose:
final_fqc_table <- t(final_fqc_table)
# Column headers are in first row:
colnames(final_fqc_table) <- final_fqc_table[1,]
final_fqc_table <- final_fqc_table[-1,]
# Move Filename column to first column:
t1 <- which(colnames(final_fqc_table) == "Filename")
final_fqc_table <- final_fqc_table[,c(t1,setdiff(1:dim(final_fqc_table)[2],t1))]
# Write output
write.table(file = out.basic,x = final_fqc_table,quote=F,row.names = F, sep="\t")

# Remove temporary files:
file.remove(temp.stats)
file.remove(temp.summary)
	library("magrittr")
	library("optparse")

	args = commandArgs(trailingOnly=TRUE)

	option_list = list(
	make_option(c("-d", "--dir"), type="character", default=NULL,
	help="Directory of fastqc results", metavar="character"),
	make_option(c("-o", "--outdir"), type="character", default=NULL,
	help="An existing dir in which to create the output files.", metavar="character")
	);

	opt_parser = optparse::OptionParser(usage = "usage: %prog [options]", option_list=option_list,epilogue="\n\nAuthor: Menachem Sklarz");
	opt = optparse::parse_args(opt_parser);


	if (is.null(opt$outdir)){
	opt$outdir <- dirname(normalizePath(opt$dir))
	exit(cat("No --outdir was passed!"))
	}


	out.dir <- opt$outdir
	zipdir <- opt$dir

	out.summary <- sprintf("%s/%s",out.dir,"fastqc_summary.txt")
	out.basic <- sprintf("%s/%s",out.dir,"fastqc_BasicStatistics.txt")

	temp.summary <- sprintf("%s/%s",out.dir,"summary.txt")
	temp.stats <- sprintf("%s/%s",out.dir,"fastqc_data.txt")


	# Get list of zip files:
	zipf_list <- dir(zipdir,recursive = T) %>%
	grep(pattern="zip",value=T) %>%
	paste(zipdir,.,sep="")

	cat("Summarizing tests...\n")
	# Extract and summarize test statistics:
	for (i in 1:length(zipf_list)) {
	zipf = zipf_list[i]
	cat(sprintf(" File... %s\n",zipf))
	# List files in a zipped file:
	archive = unzip(zipf,list=T)[,1] %>%
	grep(pattern="summary.txt",value=T)
	unzip(zipf,
	files = archive,
	junkpaths = T,
	exdir = out.dir)
	fastqc_table = read.delim(temp.summary,
	header = F,
	stringsAsFactors = F)
	names(fastqc_table) <- c(fastqc_table[1,3],"test")
	fastqc_table <- fastqc_table[,-3]
	if (i==1) {
	final_fqc_table = fastqc_table
	} else {
	final_fqc_table <- merge(final_fqc_table, fastqc_table, by="test")
	}
	}

	final_fqc_table <- t(final_fqc_table)
	colnames(final_fqc_table) <- final_fqc_table[1,]
	final_fqc_table <- final_fqc_table[-1,]

	write.table(file = out.summary, x = final_fqc_table,quote=F,row.names = T, sep="\t")

	cat("Summarizing basic stats...\n")
	# Extract and summarize test statistics:
	for (i in 1:length(zipf_list)) {
	zipf = zipf_list[i]
	cat(sprintf(" File... %s\n",zipf))
	# List files in a zipped file:
	archive = unzip(zipf,list=T)[,1] %>%
	grep(pattern="fastqc_data.txt",value=T)
	unzip(zipf,
	files = archive,
	junkpaths = T,
	exdir = out.dir)
	fastqc_table = read.delim(temp.stats,
	header = F,
	stringsAsFactors = F)
	names(fastqc_table) <- c("param","value")
	# Get range of indeices containng the size distribution data:
	size_dist_range = (which((fastqc_table$param==">>Sequence Length Distribution"))+1):
	(which(fastqc_table$param==">>END_MODULE")[min(which(which(fastqc_table$param==">>END_MODULE")>
	which(fastqc_table$param==">>Sequence Length Distribution")))]-1)
	# Get the actual part of the table:
	size_dist_table <- fastqc_table[size_dist_range,]
	# Get the max part of the bin:
	# if size_dist_table is of length 1 (w/o header), then all seqs are same length. Treat separately:
	if (length(size_dist_range)>2){
	bins = lapply(X = strsplit(size_dist_table$param[-1],split = "-",fixed = T),FUN = function(x) x[2]) %>% unlist() %>% as.numeric()
	# Get the value:
	vals = size_dist_table$value[-1] %>% as.numeric
	# Agglomerate to nearest 50bp (may be problem with bin edges...)
	size_dist_table <- tapply(X=vals, INDEX=as.factor(50-bins%%50+bins), FUN=sum)
	size_dist_table <- data.frame(param=names(size_dist_table),value=size_dist_table)
	size_dist_table$param <- sprintf("L%03d-%03d",as.numeric(as.character(size_dist_table$param))-49,as.numeric(as.character(size_dist_table$param)))
	} else {
	bins = size_dist_table$param[-1] %>% as.numeric()
	# Get the value:
	vals = size_dist_table$value[-1] %>% as.numeric
	# Agglomerate to nearest 50bp (may be problem with bin edges...)
	size_dist_table <- data.frame(param=paste("L",bins,sep=""),value=vals)
	}


	fastqc_table <- fastqc_table[fastqc_table$param %in%
	c("Filename","File type","Encoding",
	"Total Sequences","Sequences flagged as poor quality",
	"Sequence length","%GC"),]
	names(size_dist_table) <- c("param",fastqc_table$value[fastqc_table$param=="Filename"])
	names(fastqc_table) <- c("param",fastqc_table$value[fastqc_table$param=="Filename"])

	fastqc_table <- rbind(fastqc_table,size_dist_table)
	# Get length distribution lines -> table
	# Apply -> bins of length 50bp
	# Add bins to end of column
	if (i==1) {
	final_fqc_table = fastqc_table
	} else {
	final_fqc_table <- merge(final_fqc_table, fastqc_table, by="param",all = T)
	}
	}

	# Transpose:
	final_fqc_table <- t(final_fqc_table)
	# Column headers are in first row:
	colnames(final_fqc_table) <- final_fqc_table[1,]
	final_fqc_table <- final_fqc_table[-1,]
	# Move Filename column to first column:
	t1 <- which(colnames(final_fqc_table) == "Filename")
	final_fqc_table <- final_fqc_table[,c(t1,setdiff(1:dim(final_fqc_table)[2],t1))]
	# Write output
	write.table(file = out.basic,x = final_fqc_table,quote=F,row.names = F, sep="\t")

	# Remove temporary files:
	file.remove(temp.stats)
	file.remove(temp.summary)