Created
February 8, 2017 11:14
-
-
Save sklarz-bgu/33f494f6f0bfe12d782e131b5aa9cf4a to your computer and use it in GitHub Desktop.
Return tabular version of fastqc results for multiple samples.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("magrittr") | |
library("optparse") | |
args = commandArgs(trailingOnly=TRUE) | |
option_list = list( | |
make_option(c("-d", "--dir"), type="character", default=NULL, | |
help="Directory of fastqc results", metavar="character"), | |
make_option(c("-o", "--outdir"), type="character", default=NULL, | |
help="An existing dir in which to create the output files.", metavar="character") | |
); | |
opt_parser = optparse::OptionParser(usage = "usage: %prog [options]", option_list=option_list,epilogue="\n\nAuthor: Menachem Sklarz"); | |
opt = optparse::parse_args(opt_parser); | |
if (is.null(opt$outdir)){ | |
opt$outdir <- dirname(normalizePath(opt$dir)) | |
exit(cat("No --outdir was passed!")) | |
} | |
out.dir <- opt$outdir | |
zipdir <- opt$dir | |
out.summary <- sprintf("%s/%s",out.dir,"fastqc_summary.txt") | |
out.basic <- sprintf("%s/%s",out.dir,"fastqc_BasicStatistics.txt") | |
temp.summary <- sprintf("%s/%s",out.dir,"summary.txt") | |
temp.stats <- sprintf("%s/%s",out.dir,"fastqc_data.txt") | |
# Get list of zip files: | |
zipf_list <- dir(zipdir,recursive = T) %>% | |
grep(pattern="zip",value=T) %>% | |
paste(zipdir,.,sep="") | |
cat("Summarizing tests...\n") | |
# Extract and summarize test statistics: | |
for (i in 1:length(zipf_list)) { | |
zipf = zipf_list[i] | |
cat(sprintf(" File... %s\n",zipf)) | |
# List files in a zipped file: | |
archive = unzip(zipf,list=T)[,1] %>% | |
grep(pattern="summary.txt",value=T) | |
unzip(zipf, | |
files = archive, | |
junkpaths = T, | |
exdir = out.dir) | |
fastqc_table = read.delim(temp.summary, | |
header = F, | |
stringsAsFactors = F) | |
names(fastqc_table) <- c(fastqc_table[1,3],"test") | |
fastqc_table <- fastqc_table[,-3] | |
if (i==1) { | |
final_fqc_table = fastqc_table | |
} else { | |
final_fqc_table <- merge(final_fqc_table, fastqc_table, by="test") | |
} | |
} | |
final_fqc_table <- t(final_fqc_table) | |
colnames(final_fqc_table) <- final_fqc_table[1,] | |
final_fqc_table <- final_fqc_table[-1,] | |
write.table(file = out.summary, x = final_fqc_table,quote=F,row.names = T, sep="\t") | |
cat("Summarizing basic stats...\n") | |
# Extract and summarize test statistics: | |
for (i in 1:length(zipf_list)) { | |
zipf = zipf_list[i] | |
cat(sprintf(" File... %s\n",zipf)) | |
# List files in a zipped file: | |
archive = unzip(zipf,list=T)[,1] %>% | |
grep(pattern="fastqc_data.txt",value=T) | |
unzip(zipf, | |
files = archive, | |
junkpaths = T, | |
exdir = out.dir) | |
fastqc_table = read.delim(temp.stats, | |
header = F, | |
stringsAsFactors = F) | |
names(fastqc_table) <- c("param","value") | |
# Get range of indeices containng the size distribution data: | |
size_dist_range = (which((fastqc_table$param==">>Sequence Length Distribution"))+1): | |
(which(fastqc_table$param==">>END_MODULE")[min(which(which(fastqc_table$param==">>END_MODULE")> | |
which(fastqc_table$param==">>Sequence Length Distribution")))]-1) | |
# Get the actual part of the table: | |
size_dist_table <- fastqc_table[size_dist_range,] | |
# Get the max part of the bin: | |
# if size_dist_table is of length 1 (w/o header), then all seqs are same length. Treat separately: | |
if (length(size_dist_range)>2){ | |
bins = lapply(X = strsplit(size_dist_table$param[-1],split = "-",fixed = T),FUN = function(x) x[2]) %>% unlist() %>% as.numeric() | |
# Get the value: | |
vals = size_dist_table$value[-1] %>% as.numeric | |
# Agglomerate to nearest 50bp (may be problem with bin edges...) | |
size_dist_table <- tapply(X=vals, INDEX=as.factor(50-bins%%50+bins), FUN=sum) | |
size_dist_table <- data.frame(param=names(size_dist_table),value=size_dist_table) | |
size_dist_table$param <- sprintf("L%03d-%03d",as.numeric(as.character(size_dist_table$param))-49,as.numeric(as.character(size_dist_table$param))) | |
} else { | |
bins = size_dist_table$param[-1] %>% as.numeric() | |
# Get the value: | |
vals = size_dist_table$value[-1] %>% as.numeric | |
# Agglomerate to nearest 50bp (may be problem with bin edges...) | |
size_dist_table <- data.frame(param=paste("L",bins,sep=""),value=vals) | |
} | |
fastqc_table <- fastqc_table[fastqc_table$param %in% | |
c("Filename","File type","Encoding", | |
"Total Sequences","Sequences flagged as poor quality", | |
"Sequence length","%GC"),] | |
names(size_dist_table) <- c("param",fastqc_table$value[fastqc_table$param=="Filename"]) | |
names(fastqc_table) <- c("param",fastqc_table$value[fastqc_table$param=="Filename"]) | |
fastqc_table <- rbind(fastqc_table,size_dist_table) | |
# Get length distribution lines -> table | |
# Apply -> bins of length 50bp | |
# Add bins to end of column | |
if (i==1) { | |
final_fqc_table = fastqc_table | |
} else { | |
final_fqc_table <- merge(final_fqc_table, fastqc_table, by="param",all = T) | |
} | |
} | |
# Transpose: | |
final_fqc_table <- t(final_fqc_table) | |
# Column headers are in first row: | |
colnames(final_fqc_table) <- final_fqc_table[1,] | |
final_fqc_table <- final_fqc_table[-1,] | |
# Move Filename column to first column: | |
t1 <- which(colnames(final_fqc_table) == "Filename") | |
final_fqc_table <- final_fqc_table[,c(t1,setdiff(1:dim(final_fqc_table)[2],t1))] | |
# Write output | |
write.table(file = out.basic,x = final_fqc_table,quote=F,row.names = F, sep="\t") | |
# Remove temporary files: | |
file.remove(temp.stats) | |
file.remove(temp.summary) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Outdated by the wonderful MultiQC program