Created
October 25, 2021 16:39
-
-
Save evanroyrees/f0295617483324f3fd5e5c269d46e071 to your computer and use it in GitHub Desktop.
Template slurm submission script to run autometa pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
#SBATCH -p partition | |
#SBATCH -t 48:00:00 | |
#SBATCH --nodes=1 | |
#SBATCH --ntasks-per-node=16 | |
# NOTE: To create the conda environment for autometa you can supply the Makefile command: | |
# make create_environment | |
# Now activate the created conda env | |
# conda activate autometa | |
# NOTE: To install autometa in the created conda environment you can supply the Makefile command: | |
# make install | |
# Filepaths | |
assembly="Path to metagenome assembly fasta file" | |
bam="Path to metagenome read alignments.bam" | |
orfs="Path to orfs used as input to diamond blast" | |
blast="Path to diamond output file (outfmt 6)" | |
ncbi="Path to NCBI databases directory" | |
# Autometa Parameters | |
length_cutoff=3000 # in bp | |
# kmer counting, normalization transform and embedding method | |
kmer_size=5 | |
norm_method="am_clr" # choices: am_clr, clr, ilr | |
pca_dimensions=50 # NOTE: must be greater than $embed_dimensions | |
embed_dimensions=2 # NOTE: must be less than $pca_dimensions | |
embed_method="bhsne" # choices: bhsne, sksne, umap | |
# Binning clustering method | |
cluster_method="hdbscan" # choices: hdbscan, dbscan | |
# Binning metrics cutoffs | |
completeness=20.0 | |
purity=95.0 | |
cov_stddev_limit=25.0 | |
gc_stddev_limit=5.0 | |
cpus=16 | |
seed=42 | |
# Step 0: Do some Path handling with the provided `assembly` filepath | |
simpleName="TemplateAssemblyName" | |
outdir="AutometaOutdir" | |
if [ ! -d $outdir ] | |
then mkdir -p $outdir | |
fi | |
######### BEGIN ######### | |
# Step 1: filter assembly by length and retrieve contig lengths as well as GC content | |
# input: | |
# $assembly --> User input | |
# $length_cutoff --> User input | |
# output: | |
filtered_assembly="${outdir}/${simpleName}.filtered.fna" | |
gc_content="${outdir}/${simpleName}.gc_content.tsv" | |
# script: | |
autometa-length-filter \ | |
--assembly $assembly \ | |
--cutoff $length_cutoff \ | |
--output-fasta $filtered_assembly \ | |
--output-gc-content $gc_content | |
# Step 2: Determine coverages from assembly read alignments | |
# input: | |
# NOTE: $bam is defined at top and the rest of the inputs are generated by autometa | |
# output: | |
bed="${outdir}/${simpleName}.coverages.bed.tsv" | |
coverages="${outdir}/${simpleName}.coverages.tsv" | |
# script: | |
autometa-bedtools-genomecov --ibam $bam --bed $bed --output $coverages | |
# Step 3: Annotate and filter markers | |
# input: | |
# $orfs --> User input | |
# $cpus --> User input | |
# $seed --> User input | |
kingdoms=(bacteria archaea) | |
# NOTE: We iterate through both sets of markers for binning both bacterial and archeal kingdoms | |
for kingdom in ${kingdoms[@]};do | |
# kingdom-specific output: | |
hmmscan="${outdir}/${simpleName}.${kingdom}.hmmscan.tsv" | |
markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" | |
# script: | |
autometa-markers \ | |
--orfs $orfs \ | |
--hmmscan $hmmscan \ | |
--out $markers \ | |
--kingdom $kingdom \ | |
--parallel \ | |
--cpus 4 \ | |
--seed $seed | |
done | |
# Step 4.1: Determine ORF lowest common ancestor (LCA) amongst top hits | |
# input: | |
# $blast --> User Input | |
# $ncbi --> User Input | |
# output: | |
lca="${outdir}/${simpleName}.orfs.lca.tsv" | |
# script: | |
autometa-taxonomy-lca --blast $blast --dbdir $ncbi --output $lca | |
# Step 4.2: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search | |
# input: | |
# $lca --> Generated by step 4.1 | |
# $ncbi --> User Input | |
# output: | |
votes="${outdir}/${simpleName}.taxids.tsv" | |
# script: | |
autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi | |
# Step 4.3: Split assigned taxonomies into kingdoms | |
# input: | |
# $votes --> Generated by step 4.2 | |
# $outdir --> Generated by step 0 | |
# $ncbi --> User Input | |
# $assembly --> User Input | |
# output: | |
# Will write recovered superkingdoms to ${outdir} | |
# e.g. ${outdir}/${prefix}.bacteria.fna | |
# e.g. ${outdir}/${prefix}.archaea.fna | |
# e.g. ${outdir}/${prefix}.taxonomy.tsv | |
# script: | |
autometa-taxonomy \ | |
--votes $votes \ | |
--output $outdir \ | |
--prefix $simpleName \ | |
--split-rank-and-write superkingdom \ | |
--assembly $assembly \ | |
--ncbi $ncbi | |
# Step 5: Perform k-mer counting on respective kingdoms | |
# input: | |
# $kmer_size --> User input | |
# $norm_method --> User input | |
# $embed_method --> User input | |
# $embed_dimensions --> User input | |
# $cpus --> User input | |
# $seed --> User input | |
kingdoms=(bacteria archaea) | |
for kingdom in ${kingdoms[@]};do | |
# kingdom-specific input: | |
# NOTE: $fasta --> Generated by step 4.3 | |
fasta="${outdir}/${simpleName}.${kingdom}.fna" | |
# kingdom-specific output: | |
counts="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.tsv" | |
normalized="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.tsv" | |
embedded="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv" | |
# script: | |
autometa-kmers \ | |
--fasta $fasta \ | |
--kmers $counts \ | |
--size $kmer_size \ | |
--norm-output $normalized \ | |
--norm-method $norm_method \ | |
--pca-dimensions $pca_dimensions \ | |
--embedding-output $embedded \ | |
--embedding-method $embed_method \ | |
--embedding-dimensions $embed_dimensions \ | |
--cpus $cpus \ | |
--seed $seed | |
done | |
# Step 6: Perform binning on each set of bacterial and archaeal contigs | |
# input: | |
# $cpus --> User input | |
# $seed --> User input | |
taxonomy="${outdir}/${prefix}.taxonomy.tsv" # Generated by step 4.3 | |
# $gc_content --> Generated by step 1 | |
kingdoms=(bacteria archaea) | |
for kingdom in ${kingdoms[@]};do | |
# kingdom-specific input: | |
kmers="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv" # Generated by step 5 | |
markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" # Generated by step 3 | |
# kingdom-specific output: | |
output_binning="${outdir}/${simpleName}.${kingdom}.${cluster_method}.tsv" | |
output_main="${outdir}/${simpleName}.${kingdom}.${cluster_method}.main.tsv" | |
# script: | |
autometa-binning \ | |
--kmers $kmers \ | |
--coverages $coverage \ | |
--gc-content $gc_content \ | |
--markers $markers \ | |
--output-binning $output_binning \ | |
--output-main $output_main \ | |
--clustering-method $cluster_method \ | |
--completeness $completeness \ | |
--purity $purity \ | |
--cov-stddev-limit $cov_stddev_limit \ | |
--gc-stddev-limit $gc_stddev_limit \ | |
--taxonomy $taxonomy \ | |
--starting-rank superkingdom \ | |
--domain $kingdom | |
done | |
######### END ######### |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment