Last active
November 12, 2021 00:10
-
-
Save evanroyrees/d28e43a3da36616540ee227ad9f1947c to your computer and use it in GitHub Desktop.
Template slurm submission script to run autometa-large-data-mode pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
#SBATCH -p partition | |
#SBATCH -t 48:00:00 | |
#SBATCH --nodes=1 | |
#SBATCH --ntasks-per-node=16 | |
## First create environment to run Autometa, (and optionally GTDB-Tk and CheckM) | |
# git clone git@github.com:KwanLab/Autometa | |
# cd Autometa | |
# make create_environment | |
# conda activate autometa | |
## Install autometa from source | |
## NOTE: For list of available commands try `make` in the Autometa directory | |
# cd Autometa | |
# make install | |
# hmmpress -f autometa/databases/markers/bacteria.single_copy.hmm | |
# hmmpress -f autometa/databases/markers/archaea.single_copy.hmm | |
## Install GTDB-Tk for post-processing autometa bins | |
## For more info see: https://ecogenomics.github.io/GTDBTk/ | |
## Install CheckM for post-processing autometa bins | |
## For more info see: https://github.com/Ecogenomics/CheckM/wiki/Installation#how-to-install-checkm | |
# Filepaths | |
assembly="Path to metagenome assembly fasta file" | |
bam="Path to metagenome read alignments.bam" | |
orfs="Path to orfs used as input to diamond blastp" | |
blast="Path to diamond blastp results (outfmt 6)" | |
ncbi="Path to NCBI databases directory" # should contain prot.accession2taxid.gz, names.dmp, nodes.dmp and merged.dmp | |
markers_dbdir="Path to autometa markers databases" # should contain marker hmms and cutoffs files (can be downloaded from Autometa repo under `Autometa/autometa/databases/markers`) | |
# Autometa Parameters | |
length_cutoff=3000 # in bp | |
# kmer counting, normalization transform and embedding method | |
kmer_size=5 | |
norm_method="am_clr" # choices: am_clr, clr, ilr | |
pca_dimensions=50 # NOTE: must be greater than $embed_dimensions | |
embed_dimensions=2 # NOTE: must be less than $pca_dimensions | |
embed_method="umap" # choices: bhsne, sksne, umap, densne, trimap | |
# Binning clustering method | |
clustering_method="hdbscan" # choices: hdbscan, dbscan | |
# Binning metrics cutoffs | |
completeness=20.0 | |
purity=95.0 | |
cov_stddev_limit=25.0 | |
gc_stddev_limit=5.0 | |
max_partition_size=10000 | |
cpus=16 | |
seed=42 | |
# Step 0: Do some path handling with the provided `assembly` filepath | |
simpleName="TemplateAssemblyName" | |
outdir="AutometaOutdir" | |
if [ ! -d $outdir ] | |
then mkdir -p $outdir | |
fi | |
######### BEGIN ######### | |
# Step 1: filter assembly by length and retrieve contig lengths as well as GC content | |
# input: | |
# $assembly --> User input | |
# $length_cutoff --> User input | |
# output: | |
filtered_assembly="${outdir}/${simpleName}.filtered.fna" | |
gc_content="${outdir}/${simpleName}.gc_content.tsv" | |
# script: | |
autometa-length-filter \ | |
--assembly $assembly \ | |
--cutoff $length_cutoff \ | |
--output-fasta $filtered_assembly \ | |
--output-gc-content $gc_content | |
# Step 2: Determine coverages from assembly read alignments | |
# input: | |
# NOTE: $bam is defined at top and the rest of the inputs are generated by autometa | |
# output: | |
bed="${outdir}/${simpleName}.coverages.bed.tsv" | |
coverages="${outdir}/${simpleName}.coverages.tsv" | |
# script: | |
autometa-bedtools-genomecov --ibam $bam --bed $bed --output $coverages | |
# Step 3: Annotate and filter markers | |
# input: | |
# $orfs --> User input | |
# $cpus --> User input | |
# $seed --> User input | |
kingdoms=(bacteria archaea) | |
# NOTE: We iterate through both sets of markers for binning both bacterial and archeal kingdoms | |
for kingdom in ${kingdoms[@]};do | |
# kingdom-specific output: | |
hmmscan="${outdir}/${simpleName}.${kingdom}.hmmscan.tsv" | |
markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" | |
# script: | |
autometa-markers \ | |
--orfs $orfs \ | |
--hmmscan $hmmscan \ | |
--dbdir $markers_dbdir \ | |
--out $markers \ | |
--kingdom $kingdom \ | |
--parallel \ | |
--cpus 4 \ | |
--seed $seed | |
done | |
# Step 4.1: Determine ORF lowest common ancestor (LCA) amongst top hits | |
# input: | |
# $blast --> User Input | |
# $ncbi --> User Input | |
# output: | |
lca="${outdir}/${simpleName}.orfs.lca.tsv" | |
# script: | |
autometa-taxonomy-lca --blast $blast --dbdir $ncbi --output $lca | |
# Step 4.2: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search | |
# input: | |
# $lca --> Generated by step 4.1 | |
# $ncbi --> User Input | |
# output: | |
votes="${outdir}/${simpleName}.taxids.tsv" | |
# script: | |
autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi | |
# Step 4.3: Split assigned taxonomies into kingdoms | |
# input: | |
# $votes --> Generated by step 4.2 | |
# $outdir --> Generated by step 0 | |
# $ncbi --> User Input | |
# $assembly --> User Input | |
# output: | |
# Will write recovered superkingdoms to ${outdir} | |
# e.g. ${outdir}/${prefix}.bacteria.fna | |
# e.g. ${outdir}/${prefix}.archaea.fna | |
# e.g. ${outdir}/${prefix}.taxonomy.tsv | |
# script: | |
autometa-taxonomy \ | |
--votes $votes \ | |
--output $outdir \ | |
--prefix $simpleName \ | |
--split-rank-and-write superkingdom \ | |
--assembly $assembly \ | |
--ncbi $ncbi | |
# Step 5: Perform k-mer counting on respective kingdoms | |
# input: | |
# $kmer_size --> User input | |
# $norm_method --> User input | |
# $embed_method --> User input | |
# $embed_dimensions --> User input | |
# $cpus --> User input | |
# $seed --> User input | |
kingdoms=(bacteria archaea) | |
for kingdom in ${kingdoms[@]};do | |
# kingdom-specific input: | |
# NOTE: $fasta --> Generated by step 4.3 | |
fasta="${outdir}/${simpleName}.${kingdom}.fna" | |
if [ ! -f $fasta ] | |
then | |
echo "${fasta} does not exist, skipping..." | |
continue | |
fi | |
# kingdom-specific output: | |
counts="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.tsv" | |
# script: | |
autometa-kmers \ | |
--fasta $fasta \ | |
--kmers $counts \ | |
--size $kmer_size \ | |
--cpus $cpus | |
done | |
# Step 6: Perform binning on each set of bacterial and archaeal contigs | |
# input: | |
# $cpus --> User input | |
# $seed --> User input | |
taxonomy="${outdir}/${simpleName}.taxonomy.tsv" # Generated by step 4.3 | |
# $gc_content --> Generated by step 1 | |
kingdoms=(bacteria archaea) | |
for kingdom in ${kingdoms[@]};do | |
# kingdom-specific input: | |
kmers="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.tsv" # Generated by step 5 (counts) | |
markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" # Generated by step 3 | |
# kingdom-specific output: | |
cache="${outdir}/${simpleName}_${kingdom}_cache" | |
output_binning="${outdir}/${simpleName}.${kingdom}.${clustering_method}.tsv" | |
output_main="${outdir}/${simpleName}.${kingdom}.${clustering_method}.main.tsv" | |
if [ ! -f $kmers ] | |
then | |
echo "${kmers} does not exist, skipping..." | |
continue | |
fi | |
# script: | |
autometa-large-data-mode-binning \ | |
--kmers $kmers \ | |
--coverages $coverages \ | |
--gc-content $gc_content \ | |
--markers $markers \ | |
--taxonomy $taxonomy \ | |
--output-binning $output_binning \ | |
--output-main $output_main \ | |
--clustering-method $clustering_method \ | |
--completeness $completeness \ | |
--purity $purity \ | |
--cov-stddev-limit $cov_stddev_limit \ | |
--gc-stddev-limit $gc_stddev_limit \ | |
--norm-method $norm_method \ | |
--pca-dims $pca_dimensions \ | |
--embed-method $embed_method \ | |
--embed-dims $embed_dimensions \ | |
--max-partition-size $max_partition_size \ | |
--starting-rank superkingdom \ | |
--cache $cache \ | |
--rank-filter superkingdom \ | |
--rank-name-filter $kingdom | |
done | |
# Step 7: Create binning summary files | |
# input: | |
# $ncbi -> User input | |
# $assembly -> User input | |
kingdoms=(bacteria archaea) | |
for kingdom in ${kingdoms[@]};do | |
# kingdom-specific input: | |
binning_main="${outdir}/${simpleName}.${kingdom}.${clustering_method}.main.tsv" # Generated by step 6 | |
markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" # Generated by step 3 | |
# kingdom-specific output: | |
output_stats="${outdir}/${simpleName}_${kingdom}_metabin_stats.tsv" | |
output_taxonomy="${outdir}/${simpleName}_${kingdom}_metabin_taxonomy.tsv" | |
output_metabins="${outdir}/${simpleName}_${kingdom}_metabins" | |
if [ ! -f $binning_main ] | |
then | |
echo "${binning_main} does not exist, skipping..." | |
continue | |
fi | |
# script: | |
autometa-binning-summary \ | |
--binning-main $binning_main \ | |
--markers $markers \ | |
--metagenome $assembly \ | |
--ncbi $ncbi \ | |
--output-stats $output_stats \ | |
--output-taxonomy $output_taxonomy \ | |
--output-metabins $output_metabins | |
done | |
# Step 8: (OPTIONAL) Annotate Autometa bins' taxonomies and completeness/contamination metrics using GTDB-Tk & CheckM | |
# input: | |
# $output_metabins --> Generated by step 7 | |
# kingdoms=(bacteria archaea) | |
# for kingdom in ${kingdoms[@]};do | |
# # kingdom-specific input | |
# output_metabins="${outdir}/${simpleName}_${kingdom}_metabins" | |
# # kingdom-specific outputs | |
# gtdbtk_outdir="${outdir}/${simpleName}_${kingdom}_gtdbtk_classify_wf" | |
# checkm_outdir="${outdir}/${simpleName}_${kingdom}_checkm_lineage_wf" | |
# checkm_stats="${outdir}/${simpleName}_${kingdom}_checkm_stats.tsv" | |
# if [ ! -d $output_metabins ] | |
# then | |
# echo "${output_metabins} does not exist, skipping..." | |
# continue | |
# fi | |
# # NOTE: autometa-binning-summary writes out all bins with the <cluster>.fna extension | |
# # Unclustered sequences are written to one file with the .fasta extension. e.g. outdir/unclustered.fasta | |
# # First run `gtdbtk check_install` to ensure the following command will run | |
# gtdbtk classify_wf \ | |
# --genome_dir $output_metabins \ | |
# --extension fna \ | |
# --out_dir $gtdbtk_outdir \ | |
# --cpus $cpus \ | |
# --scratch_dir $LOCAL \ | |
# --pplacer_cpus 1 | |
# checkm lineage_wf \ | |
# --extension fna \ | |
# --threads $cpus \ | |
# --tmpdir $LOCAL \ | |
# --pplacer_threads 1 \ | |
# --tab_table \ | |
# --file $checkm_stats \ | |
# $output_metabins \ | |
# $checkm_outdir | |
# done | |
######### END ######### |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment