@philippbayer
Last active November 6, 2023 01:41
My current Pawsey nextflow.config
// have this as nextflow.config in the folder of your run for Pawsey's Setonix
// I settled on this command for nf-core/mag:
// nextflow run nf-core/mag --input '*R{1,2}.fastq.gz' --outdir results
// --skip_spades --cat_db https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz
// --gtdb 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz'
// -resume -profile singularity
// --refine_bins_dastool --postbinning_input both
// --busco_download_path /SOMEWHERE/busco-data.ezlab.org/v5/data
// --disable-jobs-cancellation
// --disable-jobs-cancellation is super useful for the 'main' submitting job on Setonix, as there's a 24h walltime limit.
// If that limit is hit, the main job is cancelled and, by default, all currently running jobs are cancelled too.
// This setting keeps the currently-running sub-jobs alive.
// I download the BUSCO data manually from busco-data.ezlab.org/v5/data, then extract all downloaded tar.gz files within their folders
// - otherwise each BUSCO job downloads its lineage data separately, and with thousands of jobs that gets messy
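// For reference, a rough sketch of that manual download + extraction (mirror layout and paths are just an
// illustration, adjust to your setup):
//   wget --recursive --no-parent --reject 'index.html*' https://busco-data.ezlab.org/v5/data/
//   find busco-data.ezlab.org/v5/data -name '*.tar.gz' -execdir tar -xzf {} \;
// then point --busco_download_path at the resulting busco-data.ezlab.org/v5/data folder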
//
// I use DAS Tool, and I keep both the unrefined and the DAS Tool-refined outputs - refinement removes some noise,
// but it can also drop some of the weirder outlier bins. Good to look at both.
//
// I usually skip SPAdes as it adds a lot of time and has (for me, so far) given worse assemblies
//
// BUSCO and KRONA make tons and tons of files, which is why I set their scratch folder to /tmp. If either job crashes,
// remove that line, as /tmp lives on the compute nodes and you might not see important logs from the login node.
// Same for GTDBTK - /tmp on these nodes sits in memory, so jobs can appear to run out of 'space' when in reality they ran out of memory.
// In these cases, replace /tmp with something else or just delete the line.
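// e.g. something like this in the GTDBTK_CLASSIFY block further down (the path is only a placeholder,
// point it at a real scratch directory for your project):
//   scratch = '/scratch/your-project/your-username/nxf_tmp'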
// Replace $ACCOUNT with whatever your Pawsey account group is
// IMPORTANT: include this line in the SLURM script submitting the master nextflow job:
// unset SBATCH_EXPORT
// So the child jobs will have the Singularity module loaded. See https://support.pawsey.org.au/documentation/display/US/Nextflow
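// For reference, a minimal sketch of such a submitting script (partition, module names and times are
// examples only - check the Pawsey documentation linked above for the current recommendations):
//   #!/bin/bash -l
//   #SBATCH --account=$ACCOUNT
//   #SBATCH --partition=work
//   #SBATCH --time=24:00:00
//   unset SBATCH_EXPORT
//   module load nextflow singularity/3.11.4-nompi
//   nextflow run nf-core/mag ... -profile singularity -resume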
cleanup = true
resume = true
singularity {
    enabled = true
    autoMounts = true
    envWhitelist = 'SINGULARITY_BINDPATH, SINGULARITYENV_LD_LIBRARY_PATH, SINGULARITYENV_LD_PRELOAD'
}

process {
    cache = 'lenient'
    stageInMode = 'symlink'
    module = 'singularity/3.11.4-nompi'
    executor = 'slurm'
    queue = { task.memory < 110.GB ? 'work' : 'highmem' }
    clusterOptions = ' -A $ACCOUNT '
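    // the exit codes retried below are the usual out-of-resource suspects (e.g. 143 = SIGTERM, 137 = SIGKILL,
    // often an out-of-memory kill, 134 = SIGABRT, 139 = SIGSEGV); anything else stops new jobs ('finish')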
    withName: 'BOWTIE2_ASSEMBLY_ALIGN|BOWTIE2_PHIX_REMOVAL_ALIGN' {
        cpus = { 36 }
        memory = { 50.GB }
        time = { 16.h }
        errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
    }
    withName: PROKKA {
        cpus = { 36 }
        time = { 16.h }
        errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
    }
    withName: MEGAHIT {
        cpus = { 36 }
        time = { 24.h }
        memory = { 200.GB }
        queue = { 'highmem' }
    }
    withName: 'BUSCO|KRONA' {
        scratch = '/tmp'
    }
    withName: BOWTIE2_ASSEMBLY_BUILD {
        memory = 50.GB
    }
    withName: GTDBTK_CLASSIFY {
        queue = 'highmem'
        time = { 24.h }
        memory = 300.GB
        scratch = '/tmp'
    }
    withName: BUSCO {
        memory = 50.GB
    }
}
executor {
    queueSize = 100
    $slurm {
        pollInterval = '1 min'
        queueStatInterval = '5 min'
    }
    $local {
        pollInterval = '2 sec'
    }
}
params {
    max_cpus = 36
    max_time = 24.h
}

env {
    PYTHONNOUSERSITE = 1
}