Skip to content

Instantly share code, notes, and snippets.

@Sidduppal
Last active September 10, 2023 23:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Sidduppal/4c252d22b5a6a49d441dcb7d5eab2568 to your computer and use it in GitHub Desktop.
Save Sidduppal/4c252d22b5a6a49d441dcb7d5eab2568 to your computer and use it in GitHub Desktop.
snakemake workflow for running [Autometa](https://github.com/KwanLab/Autometa)
"""
Author: Siddharth Uppal (@Sidduppal)
Date: 2023-Sept-09
Version: 1.0
Description: snakemake workflow for running [Autometa](https://github.com/KwanLab/Autometa)
"""
import glob
import os
# ---- Workflow configuration -------------------------------------------------
# Dataset prefixes. Each must match an assembly at IN_DIR/<dataset>/<dataset>.contigs.fa
# and a reads directory at READ_DIR/<dataset>/. Alter them as per your need.
datasets = [
    "dataset_1",
    "dataset_2",
]
# Kingdoms to bin. Choices: "bacteria", "archaea". If you change the taxonomy
# here, you also need to alter `rule taxonomy` as needed (uncomment the
# matching fasta output there).
kingdoms = ["bacteria"]
# IN_DIR is the path to the directory where all assembly files are.
# Change it as per your needs.
IN_DIR = "<Path/to/common/directory/for/all/assemblies>"
# All per-dataset outputs are written under OUT_DIR/<dataset>/.
OUT_DIR = "<Path/to/output/directory>"
# Paired-end reads, expected as READ_DIR/<dataset>/*_1.fq.gz and *_2.fq.gz.
READ_DIR = "<Path/to/reads/directory>"
# Databases
MARKERS_DB = "path/to/markers/db"  # Make sure they are hmm pressed
NCBI_DB = "path/to/NCBI/database/directory"  # For more info see: https://autometa.readthedocs.io/en/latest/databases.html#ncbi
NR_DB = "path/to/diamond/formatted/nr/database"  # For more info see: https://autometa.readthedocs.io/en/latest/databases.html#ncbi
# Target rule: request the final metabin taxonomy table for every
# dataset x kingdom combination, which pulls the whole workflow along.
rule all:
    input:
        expand(
            OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}_metabin_taxonomy.tsv",
            kingdom=kingdoms,
            dataset=datasets,
        ),
# Drop contigs shorter than the length cutoff and record per-contig GC content.
# Produces the filtered assembly used by all downstream rules.
rule lengthFilter:
    input:
        assembly=IN_DIR + "/{dataset}/{dataset}.contigs.fa",
    output:
        filteredAssembly=OUT_DIR + "/{dataset}/{dataset}.filtered.fna",
        gc_content=OUT_DIR + "/{dataset}/{dataset}.gc_content.tsv",
    params:
        lengthCutoff=3000,  # minimum contig length to keep, in bp
    threads: 1
    shell:
        "autometa-length-filter --assembly {input.assembly} "
        "--cutoff {params.lengthCutoff} "
        "--output-fasta {output.filteredAssembly} "
        "--output-gc-content {output.gc_content} "
        "--verbose"
# Call ORFs on the filtered assembly with Prodigal in metagenome mode
# (-p meta); -m forbids genes spanning runs of N.
rule orfs:
    input:
        filteredAssembly=OUT_DIR + "/{dataset}/{dataset}.filtered.fna",
    output:
        faa=OUT_DIR + "/{dataset}/{dataset}.filtered.faa",  # protein ORFs
        ffn=OUT_DIR + "/{dataset}/{dataset}.filtered.ffn",  # nucleotide ORFs
        gbk=OUT_DIR + "/{dataset}/{dataset}.filtered.gbk",  # GenBank-format gene calls
    threads: 1
    shell:
        "prodigal -i {input.filteredAssembly} "
        "-d {output.ffn} -a {output.faa} -f gbk -o {output.gbk} -p meta -m"
# Annotate kingdom-specific single-copy marker genes in the predicted ORFs
# via hmmscan; the markers table drives binning quality metrics.
rule markers:
    input:
        orfs=OUT_DIR + "/{dataset}/{dataset}.filtered.faa",
    output:
        markers=OUT_DIR + "/{dataset}/{dataset}.{kingdom}.markers.tsv",
        hmmscan=OUT_DIR + "/{dataset}/{dataset}.{kingdom}.hmmscan.tsv",
    threads: 4
    params:
        seed=42,  # fixed seed for reproducible runs
        db=MARKERS_DB,  # make sure that the markers are hmm pressed!
    shell:
        "autometa-markers --orfs {input.orfs} "
        "--hmmscan {output.hmmscan} "
        "--out {output.markers} --dbdir {params.db} "
        "--kingdom {wildcards.kingdom} --parallel --cpus {threads} --seed {params.seed} "
# These functions can't be used in case of multiple libraries for a single
# dataset. Change the read extension as per your need.
def _get_reads(dataset, mate):
    """Glob gzipped FASTQ paths for one mate of *dataset*'s read pairs.

    Parameters:
        dataset: the {dataset} wildcard (anything str()-able; Snakemake
            passes its wildcards object to input functions).
        mate: 1 for forward reads, 2 for reverse reads.

    Returns a sorted (hence deterministic) list of matching paths under
    READ_DIR/<dataset>/*_<mate>.fq.gz; empty if nothing matches.
    Sorting keeps R1/R2 lists aligned when several files match.
    """
    pattern = os.path.join(READ_DIR, str(dataset), "*_%d.fq.gz" % mate)
    return sorted(glob.glob(pattern))


def get_read_1(dataset):
    """Snakemake input function: forward (R1) reads for *dataset*."""
    return _get_reads(dataset, 1)


def get_read_2(dataset):
    """Snakemake input function: reverse (R2) reads for *dataset*."""
    return _get_reads(dataset, 2)
# Compute per-contig read coverage by mapping the paired reads back to the
# ORIGINAL (unfiltered) assembly.
# This rule should be altered in case of SPAdes assembly (coverage can then be
# parsed from contig headers instead of read mapping).
rule coverage:
    input:
        read_1=get_read_1,  # input functions resolve reads per dataset
        read_2=get_read_2,
        assembly=IN_DIR + "/{dataset}/{dataset}.contigs.fa",
    output:
        cov=OUT_DIR + "/{dataset}/{dataset}.coverages.tsv",
    threads: 16
    shell:
        "autometa-coverage --assembly {input.assembly} "
        "--fwd-reads {input.read_1} --rev-reads {input.read_2} "
        "--out {output.cov} "
        "--cpus {threads}"
# Search ORFs against the diamond-formatted NR database; the tabular
# (outfmt 6) hits feed the LCA taxonomy assignment below.
rule blast_ncbi:
    input:
        orfs=OUT_DIR + "/{dataset}/{dataset}.filtered.faa",
    output:
        blast=OUT_DIR + "/{dataset}/{dataset}.blastp.tsv",
    threads: 16
    params:
        ncbi=NR_DB,
    shell:
        "diamond blastp --query {input.orfs} --db {params.ncbi} --evalue 1e-5 --max-target-seqs 200 --threads {threads} "
        "--outfmt 6 --out {output.blast} "
# Reduce each ORF's blastp hits to a lowest-common-ancestor (LCA) taxid
# using the local NCBI taxonomy database.
rule taxonomyLca:
    input:
        blast=OUT_DIR + "/{dataset}/{dataset}.blastp.tsv",
    output:
        lca=OUT_DIR + "/{dataset}/{dataset}.ncbi.orfs.lca.tsv",
        sseqid_to_taxid=OUT_DIR + "/{dataset}/{dataset}.ncbi.orfs.sseqid2taxid.tsv",
        error_taxids=OUT_DIR + "/{dataset}/{dataset}.ncbi.orfs.errortaxids.tsv",  # taxids that failed lookup
    threads: 1
    params:
        ncbi=NCBI_DB,
        dbtype="ncbi",
    shell:
        "autometa-taxonomy-lca --blast {input.blast} --dbdir {params.ncbi} --dbtype {params.dbtype} "
        "--lca-output {output.lca} --sseqid2taxid-output {output.sseqid_to_taxid} --lca-error-taxids {output.error_taxids} "
        "--verbose"
# Assign one taxid per contig by majority vote over its ORFs' LCA taxids.
rule taxonomyVote:
    input:
        lca=OUT_DIR + "/{dataset}/{dataset}.ncbi.orfs.lca.tsv",
    output:
        vote=OUT_DIR + "/{dataset}/{dataset}.ncbi.taxids.tsv",
    threads: 1
    params:
        ncbi=NCBI_DB,
        dbtype="ncbi",
    shell:
        "autometa-taxonomy-majority-vote --lca {input.lca} --dbdir {params.ncbi} --dbtype {params.dbtype} "
        "--output {output.vote} --verbose"
# Expand contig taxids into full canonical-rank taxonomy and split the
# filtered assembly into per-superkingdom fastas (e.g. <dataset>.ncbi.bacteria.fna).
rule taxonomy:
    input:
        vote=OUT_DIR + "/{dataset}/{dataset}.ncbi.taxids.tsv",
        assembly=OUT_DIR + "/{dataset}/{dataset}.filtered.fna",
    output:
        taxonomy=OUT_DIR + "/{dataset}/{dataset}.ncbi.taxonomy.tsv",
        # Uncomment the the taxa you are interested in (must match `kingdoms`).
        fasta_bac=OUT_DIR + "/{dataset}/{dataset}.ncbi.bacteria.fna",
        # fasta_archae=OUT_DIR + "/{dataset}/{dataset}.ncbi.archaea.fna",
    threads: 1
    params:
        ncbi=NCBI_DB,
        dbtype="ncbi",
        split_rank="superkingdom",  # rank at which per-kingdom fastas are written
        # NOTE(review): directory() is normally an output flag; in params it
        # just yields the path string — confirm this is intended.
        outdir=directory(OUT_DIR + "/{dataset}"),
    shell:
        "autometa-taxonomy --votes {input.vote} --dbdir {params.ncbi} --dbtype {params.dbtype} "
        "--assembly {input.assembly} --prefix {wildcards.dataset}.ncbi --split-rank-and-write {params.split_rank} "
        "--output {params.outdir}"
# Count 5-mers per contig, normalize (centered log-ratio), then embed to 2D
# (BH-tSNE after a 50-dim PCA) for density-based clustering.
rule kmers:
    input:
        fasta=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.fna",
    output:
        counts=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.5mers.tsv",
        normalized=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.5mers.am_clr.tsv",
        embedded=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.5mers.am_clr.bhsne.tsv",
    threads: 8
    params:
        norm_method="am_clr",  # choices: am_clr, clr, ilr
        embed_method="bhsne",  # choices: bhsne, sksne, umap, densmap, trimap
        seed=42,  # fixed seed for reproducible embeddings
        pca_dim=50,  # PCA dimensions kept before the 2D embedding
    shell:
        "autometa-kmers --fasta {input.fasta} --kmers {output.counts} --size 5 "
        "--norm-output {output.normalized} --norm-method {params.norm_method} --pca-dimensions {params.pca_dim} "
        "--embedding-output {output.embedded} --embedding-method {params.embed_method} --embedding-dimensions 2 "
        "--cpus {threads} --seed {params.seed}"
# Cluster contigs into bins (HDBSCAN over the k-mer embedding + coverage +
# GC), guided by taxonomy and scored against the single-copy markers.
rule binning:
    input:
        kmer=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.5mers.am_clr.bhsne.tsv",
        coverage=OUT_DIR + "/{dataset}/{dataset}.coverages.tsv",
        gc_content=OUT_DIR + "/{dataset}/{dataset}.gc_content.tsv",
        markers=OUT_DIR + "/{dataset}/{dataset}.{kingdom}.markers.tsv",
        taxonomy=OUT_DIR + "/{dataset}/{dataset}.ncbi.taxonomy.tsv",
    output:
        binning=OUT_DIR + "/{dataset}/{dataset}.binning.ncbi.{kingdom}.hdbscan.tsv",
        binningMain=OUT_DIR
        + "/{dataset}/{dataset}.binning.ncbi.{kingdom}.hdbscan.main.tsv",
    threads: 16
    params:
        cluster_method="hdbscan",  # choices: hdbscan, dbscan
        completeness=20,  # Accept MAGs greater than this value
        purity=90,  # Accept MAGs greater than this value
        cov_stddev_limit=25,  # Accept MAGs less than this value
        gc_stddev_limit=5,  # Accept MAGs less than this value
        start_rank="superkingdom",  # Canonical rank at which to begin subsetting taxonomy
        rank_filter="superkingdom",  # Taxonomy column canonical rank to subset by provided value of `--rank-name-filter`
    shell:
        "autometa-binning --kmers {input.kmer} --coverages {input.coverage} --gc-content {input.gc_content} "
        "--markers {input.markers} --output-binning {output.binning} --output-main {output.binningMain} "
        "--taxonomy {input.taxonomy} --completeness {params.completeness} --purity {params.purity} "
        "--cov-stddev-limit {params.cov_stddev_limit} --gc-stddev-limit {params.gc_stddev_limit} "
        "--starting-rank {params.start_rank} --rank-filter {params.rank_filter} --rank-name-filter {wildcards.kingdom} --cpus {threads}"
# Summarize the binning run: per-MAG statistics, per-MAG taxonomy (the
# workflow's terminal target in `rule all`), and one fasta per metabin.
rule binningSummary:
    input:
        binningMain=OUT_DIR
        + "/{dataset}/{dataset}.binning.ncbi.{kingdom}.hdbscan.main.tsv",
        markers=OUT_DIR + "/{dataset}/{dataset}.{kingdom}.markers.tsv",
        assembly=OUT_DIR + "/{dataset}/{dataset}.filtered.fna",
    output:
        # Fixed: extension was duplicated ("..._metabin_stats.tsv.tsv");
        # safe to rename since no other rule consumes this file.
        output_stats=OUT_DIR
        + "/{dataset}/{dataset}.ncbi.{kingdom}_metabin_stats.tsv",
        output_taxonomy=OUT_DIR
        + "/{dataset}/{dataset}.ncbi.{kingdom}_metabin_taxonomy.tsv",
        metabins=directory(
            OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.metabins",
        ),
    threads: 1
    params:
        ncbi=NCBI_DB,
        dbtype="ncbi",
    shell:
        "autometa-binning-summary --binning-main {input.binningMain} --markers {input.markers} "
        "--metagenome {input.assembly} --dbdir {params.ncbi} --dbtype {params.dbtype} "
        "--output-stats {output.output_stats} --output-taxonomy {output.output_taxonomy} "
        "--output-metabins {output.metabins}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment