@philippbayer
Last active November 6, 2023 01:41
My current Pawsey nextflow.config
// have this as nextflow.config in the folder of your run for Pawsey's Setonix
// I settled on this command for nf-core/mag:
// nextflow run nf-core/mag --input '*R{1,2}.fastq.gz' --outdir results
// --skip_spades --cat_db https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz
// --gtdb 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz'
// -resume -profile singularity
// --refine_bins_dastool --postbinning_input both
// --busco_download_path /SOMEWHERE/busco-data.ezlab.org/v5/data
// --disable-jobs-cancellation
// --disable-jobs-cancellation is super useful for the 'main' submitting job on Setonix, as there's a 24h walltime limit.
// If that limit is hit, the main job is cancelled and, by default, all currently running jobs are cancelled too.
// This setting keeps the currently-running sub-jobs alive.
// I download the BUSCO data manually from busco-data.ezlab.org/v5/data, then extract all downloaded tar.gz files within their folders
// - otherwise each BUSCO job downloads its lineage data separately, and with thousands of jobs that gets messy
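// For reference, a rough sketch of that manual download + extraction (mirror layout and paths are just an
// illustration, adjust to your setup):
//   wget --recursive --no-parent --reject 'index.html*' https://busco-data.ezlab.org/v5/data/
//   find busco-data.ezlab.org/v5/data -name '*.tar.gz' -execdir tar -xzf {} \;
// then point --busco_download_path at the resulting busco-data.ezlab.org/v5/data folder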
//
// I use DAS Tool, and I keep both the unrefined and the DAS Tool-refined outputs - refinement removes some noise,
// but it can also drop some of the weirder outlier bins. Good to look at both.
//
// I usually skip SPAdes as it adds a lot of time and has (for me, so far) given worse assemblies
//
// BUSCO and KRONA make tons and tons of files, which is why I set their scratch folder to /tmp. If either job crashes,
// remove that line, as /tmp lives on the compute nodes and you might not see important logs from the login node.
// Same for GTDBTK - /tmp on these nodes sits in memory, so jobs can appear to run out of 'space' when in reality they ran out of memory.
// In these cases, replace /tmp with something else or just delete the line.
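// e.g. something like this in the GTDBTK_CLASSIFY block further down (the path is only a placeholder,
// point it at a real scratch directory for your project):
//   scratch = '/scratch/your-project/your-username/nxf_tmp'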
// Replace $ACCOUNT with whatever your Pawsey account group is
// IMPORTANT: include this line in the SLURM script submitting the master nextflow job:
// unset SBATCH_EXPORT
// So the child jobs will have the Singularity module loaded. See https://support.pawsey.org.au/documentation/display/US/Nextflow
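// For reference, a minimal sketch of such a submitting script (partition, module names and times are
// examples only - check the Pawsey documentation linked above for the current recommendations):
//   #!/bin/bash -l
//   #SBATCH --account=$ACCOUNT
//   #SBATCH --partition=work
//   #SBATCH --time=24:00:00
//   unset SBATCH_EXPORT
//   module load nextflow singularity/3.11.4-nompi
//   nextflow run nf-core/mag ... -profile singularity -resume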
cleanup = true
resume = true
singularity {
    enabled = true
    autoMounts = true
    envWhitelist = 'SINGULARITY_BINDPATH, SINGULARITYENV_LD_LIBRARY_PATH, SINGULARITYENV_LD_PRELOAD'
}

process {
    cache = 'lenient'
    stageInMode = 'symlink'
    module = 'singularity/3.11.4-nompi'
    executor = 'slurm'
    queue = { task.memory < 110.GB ? 'work' : 'highmem' }
    clusterOptions = ' -A $ACCOUNT '
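    // the exit codes retried below are the usual out-of-resource suspects (e.g. 143 = SIGTERM, 137 = SIGKILL,
    // often an out-of-memory kill, 134 = SIGABRT, 139 = SIGSEGV); anything else stops new jobs ('finish')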
    withName: 'BOWTIE2_ASSEMBLY_ALIGN|BOWTIE2_PHIX_REMOVAL_ALIGN' {
        cpus = { 36 }
        memory = { 50.GB }
        time = { 16.h }
        errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
    }
    withName: PROKKA {
        cpus = { 36 }
        time = { 16.h }
        errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
    }
    withName: MEGAHIT {
        cpus = { 36 }
        time = { 24.h }
        memory = { 200.GB }
        queue = { 'highmem' }
    }
    withName: 'BUSCO|KRONA' {
        scratch = '/tmp'
    }
    withName: BOWTIE2_ASSEMBLY_BUILD {
        memory = 50.GB
    }
    withName: GTDBTK_CLASSIFY {
        queue = 'highmem'
        time = { 24.h }
        memory = 300.GB
        scratch = '/tmp'
    }
    withName: BUSCO {
        memory = 50.GB
    }
}
executor {
    queueSize = 100
    $slurm {
        pollInterval = '1 min'
        queueStatInterval = '5 min'
    }
    $local {
        pollInterval = '2 sec'
    }
}
params {
    max_cpus = 36
    max_time = 24.h
}

env {
    PYTHONNOUSERSITE = 1
}