gists for blog posts
#!/bin/bash
# Wrapper to run docker based variant calling pipeline for GLASS consortium
# v1.0.2
# @sbamin @sethsa
# usage
show_help() {
cat << EOF
Wrapper to run docker based variant calling pipeline for GLASS consortium
Required parameters: unique sample name, file names for tumor and reference bam files, and GO to run the pipeline on a compute node.
Please avoid file names and sample names containing spaces or special characters.
Usage: ${0##*/} -s <UNIQUE SAMPLE NAME> -t <TUMOR BAM FILENAME> -n <NORMAL BAM FILENAME> -r <RUN MODE (DRY|GO; default: DRY)> -p <PATH TO GLASS BASE DIR (default: /home/mdauser/docktest)> | tee -a ~/glass_run.log
-h display this help and exit
-s unique sample name (Required: This will be used in all output file names as well as docker container id)
-t tumor bam file name (Required: assuming this file is under <docker base directory>/dockscratch/tmp/ path)
-n reference bam file name (Required: assuming this file is under <docker base directory>/dockscratch/tmp/ path)
-r run mode: DRY or GO (default: DRY; GO will execute actual pipeline on a compute node)
-p path to GLASS base directory where all software and code dependencies are stored (default: /home/mdauser/docktest)
Example: ${0##*/} -p /home/mdauser/docktest -t C484.TCGA-06-0125-01A-01D-1490-08.6.bam -n C484.TCGA-06-0125-10A-01D-1490-08.6.bam -s TCGA-06-0125-01A-10A -r GO | tee -a ~/glass_run.log
EOF
}
if [[ $# == 0 ]];then show_help;exit 1;fi
while getopts "s:t:n:r:p:h" opt; do
case "$opt" in
h) show_help;exit 0;;
s) SAMPLE_UUID=$OPTARG;;
t) TMID=$OPTARG;;
n) NRID=$OPTARG;;
r) RUNMODE=$OPTARG;;
p) DOCKDIR=$OPTARG;;
'?') show_help >&2; exit 1;;
esac
done
DOCKDIR=${DOCKDIR:-"/home/mdauser/docktest"}
DOCKSCRATCH=${DOCKDIR}/dockscratch
DOCKOPT=${DOCKDIR}/dockopt
## exit if GLASS base directory is not present or does not meet required subdirectory structure.
if [[ ! -d ${DOCKDIR} || ! -d ${DOCKSCRATCH} || ! -d ${DOCKOPT} || ! -d ${DOCKSCRATCH}/docker_mutect || ! -d ${DOCKOPT}/bundle ]];then
echo -e "\n##### ERROR #####\nEither GLASS base directory does not exist at ${DOCKDIR} or GLASS base directory does not seem to have required subdirectory structure, i.e.,\nThere should be ${DOCKSCRATCH} and ${DOCKOPT} directories.\nYou can override default base directory with -p flag. See example below or README file present within GLASS code repository\n#################"
show_help
exit 1
fi
## exit if GLASS base directory does not contain MuTect and/or GATK jar files at specified paths.
if [[ ! -s ${DOCKOPT}/gatk/GenomeAnalysisTK.jar || ! -s ${DOCKOPT}/mutect/mutect-1.1.7.jar || ! -s ${DOCKOPT}/mutect/muTect-1.1.4.jar ]];then
echo -e "\n##### ERROR #####\nMissing or zero-byte GATK and/or MuTect jar files at one or more of following paths:\n${DOCKOPT}/gatk/GenomeAnalysisTK.jar\n${DOCKOPT}/mutect/mutect-1.1.4.jar\n${DOCKOPT}/mutect/mutect-1.1.7.jar\nSee README file present within GLASS code repository\n#################"
show_help
exit 1
fi
## exit if sample name or tumor/reference bam file names are not specified with the -s, -t and -n flags
if [[ -z ${SAMPLE_UUID} || -z ${TMID} || -z ${NRID} ]];then
echo -e "\n##### ERROR #####\nMissing one or more of following attributes: sample name, tumor bam file name and reference bam file name.\nYou have specified either nothing with -s, -t and -n flag or it is following:\n-s ${SAMPLE_UUID}\n-t ${TMID}\n-n ${NRID}\n#################"
show_help
exit 1
fi
## Check if tumor and reference bam files are present and of non-zero size
if [[ ! -s ${DOCKSCRATCH}/tmp/${TMID} || ! -s ${DOCKSCRATCH}/tmp/${NRID} ]];then
echo -e "\n##### ERROR #####\nMissing or zero-byte tumor and/or reference bam files at one or more of following paths:\nTumor bam specified using -t flag: ${DOCKSCRATCH}/tmp/${TMID}\nReference/Normal bam specified using -n flag: ${DOCKSCRATCH}/tmp/${NRID}\nSee README file present within GLASS code repository\n#################"
show_help
exit 1
fi
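## For reference, the expected layout under the GLASS base directory
## (as implied by the checks above; the docker_mutect directory is mounted read-only into the container):
##   ${DOCKDIR}/
##     dockscratch/
##       docker_mutect/                 pipeline code
##       tmp/                           tumor and reference bam files (-t and -n)
##     dockopt/
##       bundle/
##       gatk/GenomeAnalysisTK.jar
##       mutect/mutect-1.1.7.jar and muTect-1.1.4.jar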
## make docker container ID
TSTAMP=$(date +%d%b%y_%H%M%S%Z)
DOCKID=$(printf "%s_%s" ${SAMPLE_UUID} ${TSTAMP})
## Check run mode argument supplied with -r:
RUNMODE=${RUNMODE:-DRY}
if [[ ${RUNMODE} == "DRY" ]];then
echo -e "\n##### INFO #####\ndocker run will be in dry mode because you have either not used -r flag or specified following: -r ${RUNMODE}\nTo run actual analysis pipeline, use -r GO option.\n"
#set flowr execute status
FLOWRUN="FALSE"
elif [[ ${RUNMODE} == "GO" ]]; then
echo -e "\n##### INFO #####\ndocker run will be in real mode and will submit jobs to a compute node.\nTo run in dry mode, use -r DRY option.\n"
#set flowr execute status
FLOWRUN="TRUE"
else
echo -e "\n##### ERROR #####\nIncorrect parameter to -r flag. It should be either DRY for dry run or GO to submit jobs to a compute node.\nYou have specified -r ${RUNMODE}\n#################"
show_help
exit 1
fi
## format docker run command:
DOCKCMD=$(printf "docker run -d --name %s -v %s:/scratch -v %s/docker_mutect:/scratch/docker_mutect:ro -v %s/bundle:/opt/bundle -v %s/gatk:/opt/gatk -v %s/mutect:/opt/mutect flowrbio/ultraseq:0.9.8 /bin/bash -c \"flowr run x=flowr_ultraseq mytm_bampath=\"/scratch/tmp/%s\" mynr_bampath=\"/scratch/tmp/%s\" sample_name=\"%s\" execute=%s\"" ${DOCKID} ${DOCKSCRATCH} ${DOCKSCRATCH} ${DOCKOPT} ${DOCKOPT} ${DOCKOPT} ${TMID} ${NRID} ${SAMPLE_UUID} ${FLOWRUN})
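## For illustration only (values below are hypothetical, taken from the example in the help text;
## the container name also carries the run timestamp), the composed command expands to roughly:
##   docker run -d --name TCGA-06-0125-01A-10A_26Jun16_194800CDT \
##     -v /home/mdauser/docktest/dockscratch:/scratch \
##     -v /home/mdauser/docktest/dockscratch/docker_mutect:/scratch/docker_mutect:ro \
##     -v /home/mdauser/docktest/dockopt/bundle:/opt/bundle \
##     -v /home/mdauser/docktest/dockopt/gatk:/opt/gatk \
##     -v /home/mdauser/docktest/dockopt/mutect:/opt/mutect \
##     flowrbio/ultraseq:0.9.8 /bin/bash -c "flowr run x=flowr_ultraseq mytm_bampath=/scratch/tmp/C484.TCGA-06-0125-01A-01D-1490-08.6.bam mynr_bampath=/scratch/tmp/C484.TCGA-06-0125-10A-01D-1490-08.6.bam sample_name=TCGA-06-0125-01A-10A execute=TRUE"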
## Run docker run command with strict error checking:
set -e
echo -e "\n#### Now running following docker run command ####\nUnless docker run syntax is incorrect, detailed summary of running container will be displayed at the end.\n"
echo -e "${DOCKCMD}\n"
DOCKRUN=$(eval ${DOCKCMD})
EXITSTAT=$?
set +e
## PRINT SUMMARY IF COMMAND SUCCEEDS:
cat << EOF
####################################################################################
Summary of variables supplied to docker run command at ${TSTAMP}
SAMPLE UUID: ${SAMPLE_UUID}
Tumor bam path: ${DOCKSCRATCH}/tmp/${TMID}
Reference bam path: ${DOCKSCRATCH}/tmp/${NRID}
GLASS BASE DIRECTORY: ${DOCKDIR}
GLASS CODE DIRECTORY: ${DOCKSCRATCH}/docker_mutect
#### Docker run command ####
${DOCKCMD}
The above command exited with code: ${EXITSTAT}. This exit code does not indicate the exit status of the docker container. For the latter, use:
docker ps -a --filter "id=${DOCKRUN}"
############################
Docker container name: ${DOCKID}
Docker container ID: ${DOCKRUN}
You may check container status by:
docker ps -a --filter "id=${DOCKRUN}"
You may get container logs by:
docker logs ${DOCKID} 2> ~/docker_stdout.log
RUN TYPE: ${RUNMODE} or ${FLOWRUN} ; GO or TRUE means it will submit commands to a compute node.
If RUN TYPE was GO, variant calls and log files will be saved in the following directory:
${DOCKSCRATCH}/flowr/${SAMPLE_UUID}_<UUID assigned by flowr>/
####################################################################################
EOF
## END ##
#!/bin/bash
# Wrapper to make BSUB job format on HPC running LSF job scheduler.
# @sbamin | shark
## getopts schema is modified from a script by @r_sabarinathan
# usage
show_help() {
cat << EOF
Wrapper to make BSUB job format on HPC running LSF job scheduler.
The only required parameter is the path to a file containing commands to be run on the cluster.
This file will be copied verbatim after the BSUB arguments.
Default BSUB options are: medium queue with 2 hours walltime, approx. 8 GB RAM and 4 CPU cores, with the present working directory as the job's work directory.
Usage: ${0##*/} -a <path to file containing commands> > <job.bsub>
-h display this help and exit
-j job name (default: j<random id>_username)
-w work directory (default: present work directory)
-q job queue (default: medium)
-t walltime in hours:minutes (default: 2:00)
-m memory in multiples of 8192 (default: 8192, approx. 8 GB)
-c cpu cores per node (default: 4)
-o email notifications (default: -N)
-e extra options to BSUB (default: none)
-a REQUIRED: path to file containing commands to be run on cluster. This file will be copied verbatim following BSUB arguments.
Example: ${0##*/} -j "sample_job" -w "/home/foo/myworkdir" -q long -t 26:00 -m 65536 -c 24 -o "-B -N" -a "/home/foo/mycommands.txt" > /home/foo/sample.bsub
Quotes are important for values containing spaces and special characters.
EOF
}
if [[ $# == 0 ]];then show_help;exit 1;fi
while getopts "j:w:q:t:m:c:o:e:a:h" opt; do
case "$opt" in
h) show_help;exit 0;;
j) JOBNAME=$OPTARG;;
w) CWD=$OPTARG;;
q) QUEUE=$OPTARG;;
t) WALLTIME=$OPTARG;;
m) MEMORY=$OPTARG;;
c) CPU=$OPTARG;;
o) EMAILOPTS=$OPTARG;;
e) EXTRA_OPTS=$OPTARG;;
a) MYARGS=$OPTARG;;
'?') show_help >&2; exit 1;;
esac
done
DJOBID=$(printf "j%s_%s" "$RANDOM" "$(whoami)")
JOBNAME=${JOBNAME:-$DJOBID}
CWD=${CWD:-$(pwd)}
STDOUT=$(printf "%s/log_%s.out" "${CWD}" "$JOBNAME")
STDERR=$(printf "%s/log_%s.err" "${CWD}" "$JOBNAME")
QUEUE=${QUEUE:-"medium"}
WALLTIME=${WALLTIME:-"2:00"}
MEMORY=${MEMORY:-"8192"}
CPU=${CPU:-"4"}
EMAILOPTS=${EMAILOPTS:-"-N"}
if [[ ! -s ${MYARGS} ]];then
echo -e "\nERROR: Command file either does not exist at ${MYARGS} location or empty.\n"
show_help
exit 1
fi
##### Following LSF block will be parsed based on the arguments supplied #####
cat <<EOF
#!/bin/bash
#BSUB -J ${JOBNAME} # name of the job
#BSUB -cwd ${CWD} # the working dir for each job, this is <flow_run_path>/uniqueid/tmp
#BSUB -o ${STDOUT} # output is sent to logfile, stdout + stderr by default
#BSUB -e ${STDERR} # output is sent to logfile, stdout + stderr by default
#BSUB -q ${QUEUE} # Job queue
#BSUB -W ${WALLTIME} # Walltime (hours:minutes)
#BSUB -M ${MEMORY} # Memory requirement
#BSUB -n ${CPU} # CPU reserved
#BSUB -R span[ptile=${CPU}] # CPU reserved, all reserved on same node
#BSUB -R rusage[mem=${MEMORY}] # memory reserved
#BSUB -u foo@example.com # for notifications
#BSUB ${EMAILOPTS} # send email when job ends
#BSUB -r # make the jobs re-runnable
#BSUB ${EXTRA_OPTS} # Any extra arguments passed onto queue
## following BSUB options are not being used at present.
##BSUB {{{DEPENDENCY}}} # Do not remove; dependency args come here
## --- DO NOT EDIT below this line --- ##
## following will always overwrite previous output file, if any.
set +o noclobber
$(printf "echo \"BEGIN at \$(date)\" >> %s" "${STDOUT}")
## File containing commands will be copied here verbatim ##
###################### START USER SUPPLIED COMMANDS ######################
$(cat "${MYARGS}")
###################### END USER SUPPLIED COMMANDS ######################
exitstat=\$?
$(printf "echo \"END at \$(date)\" >> %s" "${STDOUT}")
$(printf "echo \"exit status was \${exitstat}\" >> %s" "${STDOUT}")
$(printf "exit \${exitstat}")
## END ##
EOF
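## Example usage (wrapper file name and paths below are hypothetical):
##   ./make_bsub.sh -j "sample_job" -a /home/foo/mycommands.txt > /home/foo/sample.bsub
##   bsub < /home/foo/sample.bsub   # submit the generated job file to LSF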
#!/bin/bash
# Wrapper to make MSUB job format on HPC running Moab/Torque job scheduler.
# @sbamin | nautilus
## getopts schema is modified from a script by @r_sabarinathan
# usage
show_help() {
cat << EOF
Wrapper to make MSUB job format on HPC running Moab/Torque job scheduler.
The only required parameter is the path to a file containing commands to be run on the cluster.
This file will be copied verbatim after the MSUB arguments.
Default MSUB options are: medium queue with 2 hours walltime, approx. 8 GB RAM and 4 CPU cores, with the present working directory as the job's work directory.
Usage: ${0##*/} -a <path to file containing commands> > <job.msub>
-h display this help and exit
-j job name (default: j<random id>_username)
-w work directory (default: present work directory)
-q job queue (default: medium)
-t walltime in HH:MM:SS (default: 02:00:00)
-m memory in gb (default: 8gb)
-n number of nodes (default: 1)
-c cpu cores per node (default: 4)
-o email notifications (default: ae)
-e extra options to MSUB (default: none)
-a REQUIRED: path to file containing commands to be run on cluster. This file will be copied verbatim following MSUB arguments.
Example: ${0##*/} -j "sample_job" -w "/home/foo/myworkdir" -q long -t 26:00:00 -m 64gb -n 1 -c 24 -o e -a "/home/foo/mycommands.txt" > /home/foo/sample.msub
Quotes are important for values containing spaces and special characters.
EOF
}
if [[ $# == 0 ]];then show_help;exit 1;fi
while getopts "j:w:q:t:m:n:c:o:e:a:h" opt; do
case "$opt" in
h) show_help;exit 0;;
j) JOBNAME=$OPTARG;;
w) CWD=$OPTARG;;
q) QUEUE=$OPTARG;;
t) WALLTIME=$OPTARG;;
m) MEMORY=$OPTARG;;
n) NODES=$OPTARG;;
c) CPU=$OPTARG;;
o) EMAILOPTS=$OPTARG;;
e) EXTRA_OPTS=$OPTARG;;
a) MYARGS=$OPTARG;;
'?') show_help >&2; exit 1;;
esac
done
DJOBID=$(printf "j%s_%s" "$RANDOM" "$(whoami)")
JOBNAME=${JOBNAME:-$DJOBID}
CWD=${CWD:-$(pwd)}
STDOUT=$(printf "%s/log_%s.out" "${CWD}" "$JOBNAME")
STDERR=$(printf "%s/log_%s.err" "${CWD}" "$JOBNAME")
QUEUE=${QUEUE:-"medium"}
WALLTIME=${WALLTIME:-"02:00:00"}
MEMORY=${MEMORY:-"8gb"}
NODES=${NODES:-"1"}
CPU=${CPU:-"4"}
EMAILOPTS=${EMAILOPTS:-"ae"}
if [[ ! -s ${MYARGS} ]];then
echo -e "\nERROR: Command file either does not exist at ${MYARGS} location or empty.\n"
show_help
exit 1
fi
##### Following MSUB block will be parsed based on the arguments supplied #####
cat <<EOF
#!/bin/bash
#MSUB -N ${JOBNAME} # name of the job
#MSUB -d ${CWD} # the working dir for each job, this is <flow_run_path>/uniqueid/tmp
#MSUB -o ${STDOUT} # output is sent to logfile, stdout + stderr by default
#MSUB -e ${STDERR} # output is sent to logfile, stdout + stderr by default
#MSUB -q ${QUEUE} # Job queue
#MSUB -l walltime=${WALLTIME} # Walltime in HH:MM:SS
#MSUB -l mem=${MEMORY} # Memory requirement (e.g., 8gb)
#MSUB -l nodes=${NODES}:ppn=${CPU} # CPU reserved
#MSUB -M foo@example.com # for notifications
#MSUB -m ${EMAILOPTS} # send email when job ends
#MSUB -r y # make the jobs re-runnable
#MSUB -S /bin/bash # use bash shell
#MSUB ${EXTRA_OPTS} # Any extra arguments passed onto queue
## following MSUB options are not being used at present.
# For HPC Nautilus at MD Anderson: Remove the QUEUE option of MSUB or the job will fail; the queue is determined based on the walltime argument.
##MSUB ${DEPENDENCY} # If required, use dependency arg
## --- DO NOT EDIT below this line --- ##
## following will always overwrite previous output file, if any.
set +o noclobber
$(printf "echo \"BEGIN at \$(date)\" >> %s" "${STDOUT}")
## File containing commands will be copied here verbatim ##
###################### START USER SUPPLIED COMMANDS ######################
$(cat "${MYARGS}")
###################### END USER SUPPLIED COMMANDS ######################
exitstat=\$?
$(printf "echo \"END at \$(date)\" >> %s" "${STDOUT}")
$(printf "echo \"exit status was \${exitstat}\" >> %s" "${STDOUT}")
$(printf "exit \${exitstat}")
## END ##
EOF
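## Example usage (wrapper file name and paths below are hypothetical):
##   ./make_msub.sh -j "sample_job" -a /home/foo/mycommands.txt > /home/foo/sample.msub
##   msub /home/foo/sample.msub   # submit the generated job file to Moab/Torque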
## R script
# v 1.5.1 | 10-15 | @sbamin
### WARNING: Regex patterns with recursive file search - Test "within" dummy directory on dummy files first ###
# Merge multiple tables, e.g., HTSeq- or Cufflinks-generated count files: select files and rename column names based on regex patterns; exclude the trailing summary rows of HTSeq count files by limiting the number of rows read.
# Example: takes up to 6 positional arguments (the first 4 are required); be careful with regex!
# Rscript ../merge_tables_datatable.R <workdir> <filename_regex> <colnames_regex> <output_name> <number_of_cores_to_use> <number_of_rows_to_read>
# Rscript ../merge_tables_datatable.R . .*STAR.*\\.cnt$ TCGA.*v1 STAR 6 112171 | tee -a ../merge_rscript.log
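# For illustration (the file name below is hypothetical): with colnames_regex "TCGA.*v1",
# each merged column is named by applying stringr::str_extract to the file's basename, e.g.
#   str_extract("TCGA-06-0125-01A.htseq.v1.cnt", "TCGA.*v1")   # -> "TCGA-06-0125-01A.htseq.v1"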
args <- commandArgs(TRUE)
if(length(args) < 4) stop(sprintf("Incomplete arguments. Minimum four arguments required. See code header for details."))
print(sprintf("#### Supplied arguments are: %s ####", args))
mypath = as.character(args[1])
myfileext = as.character(args[2])
myfilename = as.character(args[3])
myout = as.character(args[4])
mycores = as.numeric(args[5])
if(is.na(mycores)) (mycores = 2)
if(is.na(as.numeric(args[5]))) warning(sprintf("You have either not supplied the number of CPUs to use or supplied a non-numeric value: %s; resetting to use %s threads",
as.numeric(args[5]), mycores))
mynrows = as.numeric(args[6])
if(is.na(mynrows)) (mynrows = -1L)
if(is.na(as.numeric(args[6]))) warning(sprintf("You have either not supplied the number of rows to read or supplied a non-numeric value: %s; resetting to read all rows: %s",
as.numeric(args[6]), mynrows))
print(paste0("############################"))
library(uuid)
batch_uuid <- UUIDgenerate()
outFile <- sprintf("%s/%s_%s_%s.Rout", mypath, myout, make.names(format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")), batch_uuid)
#if (!isTRUE(file.info(dirname(outFile))$isdir)) dir.create(dirname(outFile), recursive=TRUE)
sink(outFile, split = TRUE)
print(sessionInfo())
print(sprintf("Project name is %s and batch ID is %s", myout, batch_uuid))
setwd(mypath)
print(sprintf("Work dir was %s", getwd()))
my_merge_files <- function (my_path = mypath, my_fileext = myfileext, my_filename = myfilename, my_out=myout, my_nrows = mynrows, my_cores = mycores) {
setwd(my_path)
print(sprintf("## Work dir is %s ##", getwd()))
##1. identify files to read in
filesToProcess <- list.files(path = my_path, pattern = my_fileext, full.names = T, recursive = T)
print(sprintf("## Files to be merged are: ##"))
print(filesToProcess)
print(paste0("############################"))
##2. Iterate over each of those file names with lapply
library(stringr)
library(parallel)
library(data.table)
print(sprintf("######### file read START ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
#listOfFiles <- mclapply(filesToProcess, function(x) read.delim(x, header=F, col.names = c("rowid", sprintf("%s", str_extract(basename(x), pattern = my_filename))), colClasses = c("character", "numeric"), nrows = my_nrows), mc.preschedule = F, mc.cores = my_cores)
listOfFiles <- mclapply(filesToProcess, function(x) {
mydf = fread(x, header=F, colClasses = c("character", "numeric"), nrows = my_nrows)
setnames(mydf, colnames(mydf), c("rowid", sprintf("%s", str_extract(basename(x), pattern = my_filename))))
}, mc.preschedule = F, mc.cores = my_cores)
print(sprintf("## Format of files following fread function: ##"))
print(lapply(1:length(listOfFiles), function(m) colnames(listOfFiles[[m]])))
print(sprintf("## header of first two files in list: ##"))
print(head(listOfFiles[[1]]))
print(head(listOfFiles[[2]]))
print(sprintf("## tail of first two files in list: ##"))
print(tail(listOfFiles[[1]]))
print(tail(listOfFiles[[2]]))
print(sprintf("######### file read END ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
##3. Merge all of the objects in the list together with Reduce or data.table::merge.data.table.
print(sprintf("######### merge START ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
# x is the common columns to join on
## Using base function, merge:
# out_cnt <- Reduce(function(x,y) {merge.data.frame(x,y, by = "rowid")}, listOfFiles)
## For faster merging and lower memory usage, use the unexported function data.table:::merge.data.table. Although I have tested this function and it works well, double-check the final merged output for discrepancies, if any.
out_cnt <- Reduce(function(x,y) {data.table:::merge.data.table(x,y, by = "rowid")}, listOfFiles)
print(sprintf("## Merged table has %s rows and %s samples ##", dim(out_cnt)[1], dim(out_cnt)[2]-1))
print(sprintf("Column names are: %s", paste(colnames(out_cnt), collapse = ", ")))
out_cnt[1:3,1:3]
if(length(grep(".*v1\\.[a-zA-Z]", colnames(out_cnt), perl = T)) > 0)
warning(sprintf("Duplicate data based on column names was found: %s\n", sort(grep(".*v1\\.[a-zA-Z]", colnames(out_cnt), perl = T, value = T))))
print(sprintf("######### merge END ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
##4. Export merged table in tsv and rds format
write.table(out_cnt, sprintf("merged_%s_%s.tsv", make.names(format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")), my_out), quote=F, sep="\t", row.names=F)
saveRDS(out_cnt, file = sprintf("merged_%s_%s.rds", make.names(format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")), my_out))
print(sprintf("Exported merged table within work directory in tsv and rds format with file name merged_%s_%s", make.names(format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")), my_out))
}
print(sprintf("######### MERGE FUNCTION START ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
my_merge_files(my_path = mypath, my_fileext = myfileext, my_filename = myfilename, my_out=myout, my_nrows = mynrows, my_cores = mycores)
print(sprintf("######### MERGE FUNCTION COMPLETE ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
print(sessionInfo())
#END
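# For downstream analysis, the merged table can be reloaded from the rds export
# (the file name below is hypothetical; actual names carry the run timestamp):
#   cnt <- readRDS("merged_Oct_15_2015_12_00_00_CDT_STAR.rds")
#   dim(cnt)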