Skip to content

Instantly share code, notes, and snippets.

@evanroyrees
Created October 25, 2021 16:39
Show Gist options
  • Save evanroyrees/f0295617483324f3fd5e5c269d46e071 to your computer and use it in GitHub Desktop.
Save evanroyrees/f0295617483324f3fd5e5c269d46e071 to your computer and use it in GitHub Desktop.
Template slurm submission script to run autometa pipeline
#!/usr/bin/env bash
#SBATCH -p partition
#SBATCH -t 48:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
# NOTE: To create the conda environment for autometa you can supply the Makefile command:
# make create_environment
# Now activate the created conda env
# conda activate autometa
# NOTE: To install autometa in the created conda environment you can supply the Makefile command:
# make install
# Filepaths
assembly="Path to metagenome assembly fasta file"
bam="Path to metagenome read alignments.bam"
orfs="Path to orfs used as input to diamond blast"
blast="Path to diamond output file (outfmt 6)"
ncbi="Path to NCBI databases directory"
# Autometa Parameters
length_cutoff=3000 # in bp
# kmer counting, normalization transform and embedding method
kmer_size=5
norm_method="am_clr" # choices: am_clr, clr, ilr
pca_dimensions=50 # NOTE: must be greater than $embed_dimensions
embed_dimensions=2 # NOTE: must be less than $pca_dimensions
embed_method="bhsne" # choices: bhsne, sksne, umap
# Binning clustering method
cluster_method="hdbscan" # choices: hdbscan, dbscan
# Binning metrics cutoffs
completeness=20.0
purity=95.0
cov_stddev_limit=25.0
gc_stddev_limit=5.0
cpus=16
seed=42
# Step 0: Do some Path handling with the provided `assembly` filepath
simpleName="TemplateAssemblyName"
outdir="AutometaOutdir"
if [ ! -d $outdir ]
then mkdir -p $outdir
fi
######### BEGIN #########
# Step 1: filter assembly by length and retrieve contig lengths as well as GC content
# input:
# $assembly --> User input
# $length_cutoff --> User input
# output:
filtered_assembly="${outdir}/${simpleName}.filtered.fna"
gc_content="${outdir}/${simpleName}.gc_content.tsv"
# script:
autometa-length-filter \
--assembly $assembly \
--cutoff $length_cutoff \
--output-fasta $filtered_assembly \
--output-gc-content $gc_content
# Step 2: Determine coverages from assembly read alignments
# input:
# NOTE: $bam is defined at top and the rest of the inputs are generated by autometa
# output:
bed="${outdir}/${simpleName}.coverages.bed.tsv"
coverages="${outdir}/${simpleName}.coverages.tsv"
# script:
autometa-bedtools-genomecov --ibam $bam --bed $bed --output $coverages
# Step 3: Annotate and filter markers
# input:
# $orfs --> User input
# $cpus --> User input
# $seed --> User input
kingdoms=(bacteria archaea)
# NOTE: We iterate through both sets of markers for binning both bacterial and archeal kingdoms
for kingdom in ${kingdoms[@]};do
# kingdom-specific output:
hmmscan="${outdir}/${simpleName}.${kingdom}.hmmscan.tsv"
markers="${outdir}/${simpleName}.${kingdom}.markers.tsv"
# script:
autometa-markers \
--orfs $orfs \
--hmmscan $hmmscan \
--out $markers \
--kingdom $kingdom \
--parallel \
--cpus 4 \
--seed $seed
done
# Step 4.1: Determine ORF lowest common ancestor (LCA) amongst top hits
# input:
# $blast --> User Input
# $ncbi --> User Input
# output:
lca="${outdir}/${simpleName}.orfs.lca.tsv"
# script:
autometa-taxonomy-lca --blast $blast --dbdir $ncbi --output $lca
# Step 4.2: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search
# input:
# $lca --> Generated by step 4.1
# $ncbi --> User Input
# output:
votes="${outdir}/${simpleName}.taxids.tsv"
# script:
autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi
# Step 4.3: Split assigned taxonomies into kingdoms
# input:
# $votes --> Generated by step 4.2
# $outdir --> Generated by step 0
# $ncbi --> User Input
# $assembly --> User Input
# output:
# Will write recovered superkingdoms to ${outdir}
# e.g. ${outdir}/${prefix}.bacteria.fna
# e.g. ${outdir}/${prefix}.archaea.fna
# e.g. ${outdir}/${prefix}.taxonomy.tsv
# script:
autometa-taxonomy \
--votes $votes \
--output $outdir \
--prefix $simpleName \
--split-rank-and-write superkingdom \
--assembly $assembly \
--ncbi $ncbi
# Step 5: Perform k-mer counting on respective kingdoms
# input:
# $kmer_size --> User input
# $norm_method --> User input
# $embed_method --> User input
# $embed_dimensions --> User input
# $cpus --> User input
# $seed --> User input
kingdoms=(bacteria archaea)
for kingdom in ${kingdoms[@]};do
# kingdom-specific input:
# NOTE: $fasta --> Generated by step 4.3
fasta="${outdir}/${simpleName}.${kingdom}.fna"
# kingdom-specific output:
counts="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.tsv"
normalized="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.tsv"
embedded="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv"
# script:
autometa-kmers \
--fasta $fasta \
--kmers $counts \
--size $kmer_size \
--norm-output $normalized \
--norm-method $norm_method \
--pca-dimensions $pca_dimensions \
--embedding-output $embedded \
--embedding-method $embed_method \
--embedding-dimensions $embed_dimensions \
--cpus $cpus \
--seed $seed
done
# Step 6: Perform binning on each set of bacterial and archaeal contigs
# input:
# $cpus --> User input
# $seed --> User input
taxonomy="${outdir}/${prefix}.taxonomy.tsv" # Generated by step 4.3
# $gc_content --> Generated by step 1
kingdoms=(bacteria archaea)
for kingdom in ${kingdoms[@]};do
# kingdom-specific input:
kmers="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv" # Generated by step 5
markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" # Generated by step 3
# kingdom-specific output:
output_binning="${outdir}/${simpleName}.${kingdom}.${cluster_method}.tsv"
output_main="${outdir}/${simpleName}.${kingdom}.${cluster_method}.main.tsv"
# script:
autometa-binning \
--kmers $kmers \
--coverages $coverage \
--gc-content $gc_content \
--markers $markers \
--output-binning $output_binning \
--output-main $output_main \
--clustering-method $cluster_method \
--completeness $completeness \
--purity $purity \
--cov-stddev-limit $cov_stddev_limit \
--gc-stddev-limit $gc_stddev_limit \
--taxonomy $taxonomy \
--starting-rank superkingdom \
--domain $kingdom
done
######### END #########
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment