Last active
September 10, 2023 23:20
-
-
Save Sidduppal/4c252d22b5a6a49d441dcb7d5eab2568 to your computer and use it in GitHub Desktop.
snakemake workflow for running [Autometa](https://github.com/KwanLab/Autometa)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Author: Siddharth Uppal (@Sidduppal) | |
Date: 2023-Sept-09 | |
Version: 1.0 | |
Description: snakemake workflow for running [Autometa](https://github.com/KwanLab/Autometa) | |
""" | |
import glob | |
import os | |
# ---------------------------------------------------------------------------
# Workflow configuration — edit these values for your own data.
# ---------------------------------------------------------------------------

# Dataset prefixes; the whole pipeline is run once per entry.
datasets = [
    "dataset_1",
    "dataset_2",
]

# Kingdoms to bin. Choices: "bacteria", "archaea". If you change the taxonomy
# here, you also need to alter `rule taxonomy` as needed (uncomment the
# matching split-FASTA output).
kingdoms = ["bacteria"]

# IN_DIR is the path to the directory where all assembly files are.
# Change it as per your needs.
IN_DIR = "<Path/to/common/directory/for/all/assemblies>"
OUT_DIR = "<Path/to/output/directory>"
READ_DIR = "<Path/to/reads/directory>"

# Databases
MARKERS_DB = "path/to/markers/db"  # Make sure they are hmm pressed
NCBI_DB = "path/to/NCBI/database/directory"  # For more info see: https://autometa.readthedocs.io/en/latest/databases.html#ncbi
NR_DB = "path/to/diamond/formatted/nr/database"  # For more info see: https://autometa.readthedocs.io/en/latest/databases.html#ncbi
# This is the final output we need | |
# Final targets: one metabin taxonomy summary per dataset/kingdom pair.
rule all:
    input:
        expand(
            OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}_metabin_taxonomy.tsv",
            kingdom=kingdoms,
            dataset=datasets,
        ),
# Drop contigs shorter than the length cutoff and record per-contig GC content.
rule lengthFilter:
    input:
        assembly=IN_DIR + "/{dataset}/{dataset}.contigs.fa",
    output:
        filteredAssembly=OUT_DIR + "/{dataset}/{dataset}.filtered.fna",
        gc_content=OUT_DIR + "/{dataset}/{dataset}.gc_content.tsv",
    params:
        lengthCutoff=3000,  # in bp
    threads: 1
    shell:
        "autometa-length-filter --assembly {input.assembly} "
        "--cutoff {params.lengthCutoff} "
        "--output-fasta {output.filteredAssembly} "
        "--output-gc-content {output.gc_content} "
        "--verbose"
# Predict ORFs on the filtered assembly with Prodigal (metagenome mode).
rule orfs:
    input:
        filteredAssembly=OUT_DIR + "/{dataset}/{dataset}.filtered.fna",
    output:
        faa=OUT_DIR + "/{dataset}/{dataset}.filtered.faa",  # protein ORFs
        ffn=OUT_DIR + "/{dataset}/{dataset}.filtered.ffn",  # nucleotide ORFs
        gbk=OUT_DIR + "/{dataset}/{dataset}.filtered.gbk",
    threads: 1
    shell:
        "prodigal -i {input.filteredAssembly} "
        "-d {output.ffn} -a {output.faa} -f gbk -o {output.gbk} -p meta -m"
# Annotate single-copy marker genes in the predicted ORFs, per kingdom.
rule markers:
    input:
        orfs=OUT_DIR + "/{dataset}/{dataset}.filtered.faa",
    output:
        markers=OUT_DIR + "/{dataset}/{dataset}.{kingdom}.markers.tsv",
        hmmscan=OUT_DIR + "/{dataset}/{dataset}.{kingdom}.hmmscan.tsv",
    threads: 4
    params:
        seed=42,
        db=MARKERS_DB,  # make sure that the markers are hmm pressed!
    shell:
        "autometa-markers --orfs {input.orfs} "
        "--hmmscan {output.hmmscan} "
        "--out {output.markers} --dbdir {params.db} "
        "--kingdom {wildcards.kingdom} --parallel --cpus {threads} --seed {params.seed} "
def get_read_1(wildcards):
    """Snakemake input function: forward-read files for one dataset.

    Can't be used when a single dataset has multiple libraries.
    Change the read extension as per your need.

    NOTE(review): the original did str() on the whole Wildcards object,
    which space-joins *all* wildcard values and only works because this
    rule has a single wildcard; accessing `wildcards.dataset` is explicit
    and robust. Results are sorted so the read_1/read_2 lists stay paired
    (glob order is otherwise arbitrary).
    """
    pattern = os.path.join(READ_DIR, str(wildcards.dataset), "*_1.fq.gz")
    return sorted(glob.glob(pattern))
def get_read_2(wildcards):
    """Snakemake input function: reverse-read files for one dataset.

    Can't be used when a single dataset has multiple libraries.
    Change the read extension as per your need.

    NOTE(review): uses `wildcards.dataset` (not str() of the whole
    Wildcards object) and sorts so the list pairs with get_read_1.
    """
    pattern = os.path.join(READ_DIR, str(wildcards.dataset), "*_2.fq.gz")
    return sorted(glob.glob(pattern))
# This rule should be altered in case of SPAdes assembly | |
rule coverage: | |
input: | |
read_1=get_read_1, | |
read_2=get_read_2, | |
assembly=IN_DIR + "/{dataset}/{dataset}.contigs.fa", | |
output: | |
cov=OUT_DIR + "/{dataset}/{dataset}.coverages.tsv", | |
threads: 16 | |
shell: | |
"autometa-coverage --assembly {input.assembly} " | |
"--fwd-reads {input.read_1} --rev-reads {input.read_2} " | |
"--out {output.cov} " | |
"--cpus {threads}" | |
# Search ORFs against NCBI nr with DIAMOND (tabular outfmt 6 for Autometa LCA).
rule blast_ncbi:
    input:
        orfs=OUT_DIR + "/{dataset}/{dataset}.filtered.faa",
    output:
        blast=OUT_DIR + "/{dataset}/{dataset}.blastp.tsv",
    threads: 16
    params:
        ncbi=NR_DB,
    shell:
        "diamond blastp --query {input.orfs} --db {params.ncbi} --evalue 1e-5 --max-target-seqs 200 --threads {threads} "
        "--outfmt 6 --out {output.blast} "
# Assign a lowest-common-ancestor taxid to each ORF from its BLAST hits.
rule taxonomyLca:
    input:
        blast=OUT_DIR + "/{dataset}/{dataset}.blastp.tsv",
    output:
        lca=OUT_DIR + "/{dataset}/{dataset}.ncbi.orfs.lca.tsv",
        sseqid_to_taxid=OUT_DIR + "/{dataset}/{dataset}.ncbi.orfs.sseqid2taxid.tsv",
        error_taxids=OUT_DIR + "/{dataset}/{dataset}.ncbi.orfs.errortaxids.tsv",
    threads: 1
    params:
        ncbi=NCBI_DB,
        dbtype="ncbi",
    shell:
        "autometa-taxonomy-lca --blast {input.blast} --dbdir {params.ncbi} --dbtype {params.dbtype} "
        "--lca-output {output.lca} --sseqid2taxid-output {output.sseqid_to_taxid} --lca-error-taxids {output.error_taxids} "
        "--verbose"
# Roll ORF-level LCA assignments up to a per-contig taxid by majority vote.
rule taxonomyVote:
    input:
        lca=OUT_DIR + "/{dataset}/{dataset}.ncbi.orfs.lca.tsv",
    output:
        vote=OUT_DIR + "/{dataset}/{dataset}.ncbi.taxids.tsv",
    threads: 1
    params:
        ncbi=NCBI_DB,
        dbtype="ncbi",
    shell:
        "autometa-taxonomy-majority-vote --lca {input.lca} --dbdir {params.ncbi} --dbtype {params.dbtype} "
        "--output {output.vote} --verbose"
# Resolve full taxonomy per contig and split the assembly by superkingdom.
rule taxonomy:
    input:
        vote=OUT_DIR + "/{dataset}/{dataset}.ncbi.taxids.tsv",
        assembly=OUT_DIR + "/{dataset}/{dataset}.filtered.fna",
    output:
        taxonomy=OUT_DIR + "/{dataset}/{dataset}.ncbi.taxonomy.tsv",
        # Uncomment the taxa you are interested in
        fasta_bac=OUT_DIR + "/{dataset}/{dataset}.ncbi.bacteria.fna",
        # fasta_archae=OUT_DIR + "/{dataset}/{dataset}.ncbi.archaea.fna",
    threads: 1
    params:
        ncbi=NCBI_DB,
        dbtype="ncbi",
        split_rank="superkingdom",
        # NOTE(review): directory() is an *output* flag and does not belong in
        # params; a plain wildcard string is expanded the same way.
        outdir=OUT_DIR + "/{dataset}",
    shell:
        "autometa-taxonomy --votes {input.vote} --dbdir {params.ncbi} --dbtype {params.dbtype} "
        "--assembly {input.assembly} --prefix {wildcards.dataset}.ncbi --split-rank-and-write {params.split_rank} "
        "--output {params.outdir}"
# Count 5-mers per contig, CLR-normalize, and embed to 2-D for clustering.
rule kmers:
    input:
        fasta=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.fna",
    output:
        counts=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.5mers.tsv",
        normalized=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.5mers.am_clr.tsv",
        embedded=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.5mers.am_clr.bhsne.tsv",
    threads: 8
    params:
        norm_method="am_clr",  # choices: am_clr, clr, ilr
        embed_method="bhsne",  # choices: bhsne, sksne, umap, densmap, trimap
        seed=42,
        pca_dim=50,
    shell:
        "autometa-kmers --fasta {input.fasta} --kmers {output.counts} --size 5 "
        "--norm-output {output.normalized} --norm-method {params.norm_method} --pca-dimensions {params.pca_dim} "
        "--embedding-output {output.embedded} --embedding-method {params.embed_method} --embedding-dimensions 2 "
        "--cpus {threads} --seed {params.seed}"
# Cluster contigs into genome bins from k-mer embedding + coverage + GC,
# guided by markers and taxonomy.
rule binning:
    input:
        kmer=OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.5mers.am_clr.bhsne.tsv",
        coverage=OUT_DIR + "/{dataset}/{dataset}.coverages.tsv",
        gc_content=OUT_DIR + "/{dataset}/{dataset}.gc_content.tsv",
        markers=OUT_DIR + "/{dataset}/{dataset}.{kingdom}.markers.tsv",
        taxonomy=OUT_DIR + "/{dataset}/{dataset}.ncbi.taxonomy.tsv",
    output:
        binning=OUT_DIR + "/{dataset}/{dataset}.binning.ncbi.{kingdom}.hdbscan.tsv",
        binningMain=OUT_DIR
        + "/{dataset}/{dataset}.binning.ncbi.{kingdom}.hdbscan.main.tsv",
    threads: 16
    params:
        cluster_method="hdbscan",  # choices: hdbscan, dbscan
        completeness=20,  # Accept MAGs greater than this value
        purity=90,  # Accept MAGs greater than this value
        cov_stddev_limit=25,  # Accept MAGs less than this value
        gc_stddev_limit=5,  # Accept MAGs less than this value
        start_rank="superkingdom",  # Canonical rank at which to begin subsetting taxonomy
        rank_filter="superkingdom",  # Taxonomy column canonical rank to subset by provided value of `--rank-name-filter`
    shell:
        "autometa-binning --kmers {input.kmer} --coverages {input.coverage} --gc-content {input.gc_content} "
        "--markers {input.markers} --output-binning {output.binning} --output-main {output.binningMain} "
        "--taxonomy {input.taxonomy} --completeness {params.completeness} --purity {params.purity} "
        "--cov-stddev-limit {params.cov_stddev_limit} --gc-stddev-limit {params.gc_stddev_limit} "
        "--starting-rank {params.start_rank} --rank-filter {params.rank_filter} --rank-name-filter {wildcards.kingdom} --cpus {threads}"
# Summarize the bins: per-MAG stats, per-MAG taxonomy, and one FASTA per bin.
rule binningSummary:
    input:
        binningMain=OUT_DIR
        + "/{dataset}/{dataset}.binning.ncbi.{kingdom}.hdbscan.main.tsv",
        markers=OUT_DIR + "/{dataset}/{dataset}.{kingdom}.markers.tsv",
        assembly=OUT_DIR + "/{dataset}/{dataset}.filtered.fna",
    output:
        # NOTE(review): fixed duplicated extension (was "..._metabin_stats.tsv.tsv")
        output_stats=OUT_DIR
        + "/{dataset}/{dataset}.ncbi.{kingdom}_metabin_stats.tsv",
        output_taxonomy=OUT_DIR
        + "/{dataset}/{dataset}.ncbi.{kingdom}_metabin_taxonomy.tsv",
        metabins=directory(
            OUT_DIR + "/{dataset}/{dataset}.ncbi.{kingdom}.metabins",
        ),
    threads: 1
    params:
        ncbi=NCBI_DB,
        dbtype="ncbi",
    shell:
        "autometa-binning-summary --binning-main {input.binningMain} --markers {input.markers} "
        "--metagenome {input.assembly} --dbdir {params.ncbi} --dbtype {params.dbtype} "
        "--output-stats {output.output_stats} --output-taxonomy {output.output_taxonomy} "
        "--output-metabins {output.metabins}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment