Skip to content

Instantly share code, notes, and snippets.

@peterk87
Last active May 23, 2024 21:25
Show Gist options
  • Save peterk87/54e29441a741265af7c98df1675b330f to your computer and use it in GitHub Desktop.
Save peterk87/54e29441a741265af7c98df1675b330f to your computer and use it in GitHub Desktop.
CFIA-NCFAD/nf-villumina Bash wrapper script
#!/bin/bash
# SIGINT handler: prints a blank line followed by a highlighted error line on
# stdout, then terminates the script with exit status 1.
handle_interrupt() {
  printf '\n\033[1;31mERROR:\033[1m %s\033[0m\n' 'Script interrupted by user (Ctrl+C)'
  exit 1
}

# Route Ctrl+C (SIGINT) to the handler above.
trap handle_interrupt INT
# ---- Default values ----
# CPU budget: the number of processing units reported by nproc.
TOTAL_CPUS="$(nproc)"
# Default locations of the reference databases.
DEFAULT_BLASTN_NT_DB='/opt/DB/blast/nt/nt'
DEFAULT_KRAKEN2_DB='/opt/DB/kraken2/nt_20231129'
DEFAULT_CENTRIFUGE_DB='/opt/DB/centrifuge/nt-2020-02-04/nt'
# Memory budget: the MemAvailable figure from /proc/meminfo (in KB),
# converted to GB by bc with two decimal places.
FREE_MEM_KB="$(awk '/MemAvailable/ {print $2}' /proc/meminfo)"
FREE_MEM_GB="$(bc <<< "scale=2; ${FREE_MEM_KB} / 1024 / 1024")"
# Print the help text to stdout and exit with status 0.
# The text covers the synopsis, the -h/-n/-m options and the three optional
# database path arguments, each shown with its current default value.
# Globals (read): TOTAL_CPUS, FREE_MEM_GB, DEFAULT_BLASTN_NT_DB,
#                 DEFAULT_KRAKEN2_DB, DEFAULT_CENTRIFUGE_DB
usage() {
  printf '%s\n' \
    "Usage: $0 [-h] [-n NCPUS] [-m MEM_TO_USE_GB] [BLASTN_NT_DB] [KRAKEN2_DB] [CENTRIFUGE_DB]" \
    "Options:" \
    " -h Display this help message" \
    " -n NCPUS Number of CPUs to use (default: total CPUs; ${TOTAL_CPUS})" \
    " -m MEM_TO_USE_GB Amount of memory to use in GB (default: free memory; ${FREE_MEM_GB} GB)" \
    "Arguments:" \
    " BLASTN_NT_DB Path to BLASTN NT database (default: ${DEFAULT_BLASTN_NT_DB})" \
    " KRAKEN2_DB Path to Kraken2 database (default: ${DEFAULT_KRAKEN2_DB})" \
    " CENTRIFUGE_DB Path to Centrifuge database (default: ${DEFAULT_CENTRIFUGE_DB})"
  exit 0
}
# Parse command-line options.
#
# parse_options ARGS...
#   Recognises -h, -n NCPUS and -m MEM_TO_USE_GB in ARGS.
#   Globals written: NCPUS, MEM_TO_USE_GB, OPTIND (left pointing at the first
#   positional argument so the caller can shift the options away).
#   -h prints the help text and exits 0 (via usage).
#   An unknown option or a missing option argument prints a message plus the
#   help text on stderr and exits 1, so callers can detect the failure.
parse_options() {
  local opt
  OPTIND=1
  # Leading ':' selects silent mode: getopts reports problems through the
  # '?' and ':' cases instead of printing its own diagnostics.
  while getopts ":hn:m:" opt; do
    case "${opt}" in
      h)
        usage
        ;;
      n)
        NCPUS=$OPTARG
        ;;
      m)
        MEM_TO_USE_GB=$OPTARG
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        # usage terminates with status 0; run it in a subshell so the
        # failing exit status is set here instead.
        (usage) >&2
        exit 1
        ;;
      :)
        echo "Option -$OPTARG requires an argument." >&2
        (usage) >&2
        exit 1
        ;;
    esac
  done
}
parse_options "$@"
# Drop the parsed options; "$@" now holds only the positional arguments.
shift $((OPTIND - 1))
# Effective CPU budget: the -n value when given, otherwise all detected CPUs.
NCPUS=${NCPUS:-$TOTAL_CPUS}
# NCPUS is used in shell arithmetic below and passed on as a CPU limit, so
# reject anything that is not a positive integer before going further.
if ! [[ "$NCPUS" =~ ^[1-9][0-9]*$ ]]; then
  echo "Invalid number of CPUs: '$NCPUS' (expected a positive integer)" >&2
  exit 1
fi
# Half of the CPU budget. Integer division yields 0 for NCPUS=1, which is
# not a usable CPU count, so never go below 1.
HALF_CPUS=$((NCPUS / 2))
if ((HALF_CPUS < 1)); then
  HALF_CPUS=1
fi
# Effective memory budget: the -m value when given, otherwise free memory.
MEM_TO_USE_GB=${MEM_TO_USE_GB:-$FREE_MEM_GB}
# Database paths: positional arguments 1-3 when given, otherwise the defaults
BLASTN_NT_DB=${1:-$DEFAULT_BLASTN_NT_DB}
KRAKEN2_DB=${2:-$DEFAULT_KRAKEN2_DB}
CENTRIFUGE_DB=${3:-$DEFAULT_CENTRIFUGE_DB}
# Shared log formatter. Writes one line to stdout of the form
#   <ISO-8601 timestamp> <bold coloured LEVEL>: <bold message>
#   $1 - ANSI colour number used for the level tag
#   $2 - level tag text
#   $3 - message (echo -e interprets backslash escapes in it)
_log_line() {
  local colour=$1 level=$2 message=$3
  echo -e "$(date -Is) \033[1;${colour}m${level}: \033[0m\033[1m${message}\033[0m"
}
# Log message $1 with a red ERROR tag.
error() {
  _log_line 31 ERROR "$1"
}
# Log message $1 with a green INFO tag.
info() {
  _log_line 32 INFO "$1"
}
# Check if the reads directory exists and contains at least two .fastq.gz files.
# The files are counted with a glob rather than by parsing `ls` output, so
# unusual file names cannot skew the count. nullglob is enabled only inside
# the command substitution's subshell and does not leak into the rest of the
# script.
fastq_count=0
if [[ -d "reads/" ]]; then
  fastq_count=$(
    shopt -s nullglob
    fastq_files=(reads/*.fastq.gz)
    echo "${#fastq_files[@]}"
  )
fi
if ((fastq_count < 2)); then
  # Diagnostic for a fatal condition: send it to stderr.
  error "'reads/' directory does not exist or contains fewer than two .fastq.gz files!" >&2
  exit 1
fi
info "Using ${NCPUS} CPUs and ${MEM_TO_USE_GB} GB for nf-villumina analysis"
info "Creating nf-villumina.environment.yml"
# Write the Conda environment specification into the current directory.
# The delimiter is quoted because the YAML contains nothing the shell should
# expand.
# The entries following "- pip:" are indented so that they form the pip
# package list; without the nesting the "pip:" key has no value.
# NOTE(review): every entry after "- pip:" is treated as a pip package here -
# confirm that biopython/click/attrs/numpy/pandas were meant to be installed
# by pip rather than by conda.
cat > nf-villumina.environment.yml <<'EOL'
name: nf-villumina
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bbmap
- blast
- centrifuge-core
- curl
- fastp
- fastqc
- kraken2
- mash
- megahit
- pbgzip
- samtools
- seqtk
- shovill
- spades
- unicycler
- openjdk=21
- python
- pip
- pip:
  - filter_classified_reads
  - biopython
  - click
  - attrs
  - numpy
  - pandas
EOL
info "Creating nf-villumina Conda env"
# This script is meant to be re-run (the pipeline is launched with -resume),
# so only create the environment when it is not already listed, and stop if
# the creation itself fails instead of carrying on without an environment.
if conda env list | awk '$1 == "nf-villumina" { found = 1 } END { exit !found }'; then
  info "Conda env nf-villumina already exists. Skipping creation..."
elif ! conda env create -f nf-villumina.environment.yml; then
  error "Failed to create Conda env from nf-villumina.environment.yml" >&2
  exit 1
fi
# Source the conda.sh script to use conda in the script
source "$(conda info --base)/etc/profile.d/conda.sh"
# Activate the conda environment
conda activate nf-villumina
# Check if the environment was activated successfully
if [[ "$CONDA_DEFAULT_ENV" == "nf-villumina" ]]; then
  echo "Conda environment nf-villumina activated!"
else
  echo "Failed to activate conda environment nf-villumina." >&2
  exit 1
fi
info "Exporting current Conda env for debugging"
# tee -a appends: the file accumulates one export per run of this script.
conda env export | tee -a nf-villumina.conda-env.yml
info "Creating custom config for large centrifuge and kraken2 indexes"
# HALF_CPUS is the result of an integer division and can be 0 (for example
# with a single CPU); never request fewer than one CPU for KRAKEN2.
kraken2_cpus=$((HALF_CPUS > 0 ? HALF_CPUS : 1))
# NOTE(review): the KRAKEN2 memory request below is derived from the CPU count
# (NCPUS), not from MEM_TO_USE_GB - confirm that this is intended.
# The delimiter is unquoted on purpose: ${kraken2_cpus} and ${NCPUS} are
# expanded by the shell while the config file is written.
cat > nf-villumina.big-index.config <<EOL
trace.overwrite = true
dag.overwrite = true
report.overwrite = true
timeline.overwrite = true
process {
withName:CENTRIFUGE {
errorStrategy = 'retry'
cpus = 1
memory = 300.GB
time = '2d'
}
withName:KRAKEN2 {
errorStrategy = 'retry'
cpus = ${kraken2_cpus}
memory = ${NCPUS}.GB
time = '3d'
}
}
EOL
# Per-day file holding the virus taxids (NCBI taxid 10239) given to the
# pipeline's --blastn_taxids option.
TAXIDLIST="$(date -I)-viruses-10239.taxidlist"
# -s instead of -f: an empty file left behind by a failed download must not
# count as an existing taxid list.
if [[ -s "$TAXIDLIST" ]]; then
  info "Taxid list file '$TAXIDLIST' already exists. Skipping get_species_taxids.sh step..."
else
  info "Getting latest viruses taxids from NCBI with 'get_species_taxids.sh' and outputting to $TAXIDLIST"
  # Write to a side file and rename it only on success, so an interrupted or
  # failed download never leaves a file that later runs would accept.
  taxidlist_partial="${TAXIDLIST}.partial"
  if get_species_taxids.sh -t 10239 > "$taxidlist_partial" && [[ -s "$taxidlist_partial" ]]; then
    mv -- "$taxidlist_partial" "$TAXIDLIST"
  else
    rm -f -- "$taxidlist_partial"
    error "get_species_taxids.sh failed or returned no taxids; '$TAXIDLIST' was not created" >&2
    exit 1
  fi
fi
info "Pulling latest version of nf-villumina"
nextflow pull CFIA-NCFAD/nf-villumina
info "Running nf-villumina"
# Every expansion is quoted so that database paths containing spaces or glob
# characters reach nextflow as single, unmodified arguments. The --reads
# pattern is quoted as well, so the shell passes it through literally.
nextflow run CFIA-NCFAD/nf-villumina \
  -c nf-villumina.big-index.config \
  -resume \
  --reads "reads/*R{1,2}*.fastq.gz" \
  --blastn_db "$BLASTN_NT_DB" \
  --centrifuge_db "$CENTRIFUGE_DB" \
  --kraken2_db "$KRAKEN2_DB" \
  --blastn_taxids "$TAXIDLIST" \
  --max_cpus "$NCPUS" --max_memory "${MEM_TO_USE_GB} GB"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment