Skip to content

Instantly share code, notes, and snippets.

@sklarz-bgu
Created February 8, 2017 11:14
Show Gist options
  • Save sklarz-bgu/33f494f6f0bfe12d782e131b5aa9cf4a to your computer and use it in GitHub Desktop.
Save sklarz-bgu/33f494f6f0bfe12d782e131b5aa9cf4a to your computer and use it in GitHub Desktop.
Return tabular version of fastqc results for multiple samples.
library("magrittr")
library("optparse")
args = commandArgs(trailingOnly=TRUE)
option_list = list(
make_option(c("-d", "--dir"), type="character", default=NULL,
help="Directory of fastqc results", metavar="character"),
make_option(c("-o", "--outdir"), type="character", default=NULL,
help="An existing dir in which to create the output files.", metavar="character")
);
opt_parser = optparse::OptionParser(usage = "usage: %prog [options]", option_list=option_list,epilogue="\n\nAuthor: Menachem Sklarz");
opt = optparse::parse_args(opt_parser);
if (is.null(opt$outdir)){
opt$outdir <- dirname(normalizePath(opt$dir))
exit(cat("No --outdir was passed!"))
}
out.dir <- opt$outdir
zipdir <- opt$dir
out.summary <- sprintf("%s/%s",out.dir,"fastqc_summary.txt")
out.basic <- sprintf("%s/%s",out.dir,"fastqc_BasicStatistics.txt")
temp.summary <- sprintf("%s/%s",out.dir,"summary.txt")
temp.stats <- sprintf("%s/%s",out.dir,"fastqc_data.txt")
# Get list of zip files:
zipf_list <- dir(zipdir,recursive = T) %>%
grep(pattern="zip",value=T) %>%
paste(zipdir,.,sep="")
cat("Summarizing tests...\n")
# Extract and summarize test statistics:
for (i in 1:length(zipf_list)) {
zipf = zipf_list[i]
cat(sprintf(" File... %s\n",zipf))
# List files in a zipped file:
archive = unzip(zipf,list=T)[,1] %>%
grep(pattern="summary.txt",value=T)
unzip(zipf,
files = archive,
junkpaths = T,
exdir = out.dir)
fastqc_table = read.delim(temp.summary,
header = F,
stringsAsFactors = F)
names(fastqc_table) <- c(fastqc_table[1,3],"test")
fastqc_table <- fastqc_table[,-3]
if (i==1) {
final_fqc_table = fastqc_table
} else {
final_fqc_table <- merge(final_fqc_table, fastqc_table, by="test")
}
}
final_fqc_table <- t(final_fqc_table)
colnames(final_fqc_table) <- final_fqc_table[1,]
final_fqc_table <- final_fqc_table[-1,]
write.table(file = out.summary, x = final_fqc_table,quote=F,row.names = T, sep="\t")
cat("Summarizing basic stats...\n")
# Extract and summarize test statistics:
for (i in 1:length(zipf_list)) {
zipf = zipf_list[i]
cat(sprintf(" File... %s\n",zipf))
# List files in a zipped file:
archive = unzip(zipf,list=T)[,1] %>%
grep(pattern="fastqc_data.txt",value=T)
unzip(zipf,
files = archive,
junkpaths = T,
exdir = out.dir)
fastqc_table = read.delim(temp.stats,
header = F,
stringsAsFactors = F)
names(fastqc_table) <- c("param","value")
# Get range of indeices containng the size distribution data:
size_dist_range = (which((fastqc_table$param==">>Sequence Length Distribution"))+1):
(which(fastqc_table$param==">>END_MODULE")[min(which(which(fastqc_table$param==">>END_MODULE")>
which(fastqc_table$param==">>Sequence Length Distribution")))]-1)
# Get the actual part of the table:
size_dist_table <- fastqc_table[size_dist_range,]
# Get the max part of the bin:
# if size_dist_table is of length 1 (w/o header), then all seqs are same length. Treat separately:
if (length(size_dist_range)>2){
bins = lapply(X = strsplit(size_dist_table$param[-1],split = "-",fixed = T),FUN = function(x) x[2]) %>% unlist() %>% as.numeric()
# Get the value:
vals = size_dist_table$value[-1] %>% as.numeric
# Agglomerate to nearest 50bp (may be problem with bin edges...)
size_dist_table <- tapply(X=vals, INDEX=as.factor(50-bins%%50+bins), FUN=sum)
size_dist_table <- data.frame(param=names(size_dist_table),value=size_dist_table)
size_dist_table$param <- sprintf("L%03d-%03d",as.numeric(as.character(size_dist_table$param))-49,as.numeric(as.character(size_dist_table$param)))
} else {
bins = size_dist_table$param[-1] %>% as.numeric()
# Get the value:
vals = size_dist_table$value[-1] %>% as.numeric
# Agglomerate to nearest 50bp (may be problem with bin edges...)
size_dist_table <- data.frame(param=paste("L",bins,sep=""),value=vals)
}
fastqc_table <- fastqc_table[fastqc_table$param %in%
c("Filename","File type","Encoding",
"Total Sequences","Sequences flagged as poor quality",
"Sequence length","%GC"),]
names(size_dist_table) <- c("param",fastqc_table$value[fastqc_table$param=="Filename"])
names(fastqc_table) <- c("param",fastqc_table$value[fastqc_table$param=="Filename"])
fastqc_table <- rbind(fastqc_table,size_dist_table)
# Get length distribution lines -> table
# Apply -> bins of length 50bp
# Add bins to end of column
if (i==1) {
final_fqc_table = fastqc_table
} else {
final_fqc_table <- merge(final_fqc_table, fastqc_table, by="param",all = T)
}
}
# Transpose:
final_fqc_table <- t(final_fqc_table)
# Column headers are in first row:
colnames(final_fqc_table) <- final_fqc_table[1,]
final_fqc_table <- final_fqc_table[-1,]
# Move Filename column to first column:
t1 <- which(colnames(final_fqc_table) == "Filename")
final_fqc_table <- final_fqc_table[,c(t1,setdiff(1:dim(final_fqc_table)[2],t1))]
# Write output
write.table(file = out.basic,x = final_fqc_table,quote=F,row.names = F, sep="\t")
# Remove temporary files:
file.remove(temp.stats)
file.remove(temp.summary)
@sklarz-bgu
Copy link
Author

Outdated by the wonderful MultiQC program

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment