Skip to content

Instantly share code, notes, and snippets.

@elsherbini
Created September 19, 2016 20:12
Show Gist options
  • Save elsherbini/a165a0760912c78d9adcc737215c6077 to your computer and use it in GitHub Desktop.
Save elsherbini/a165a0760912c78d9adcc737215c6077 to your computer and use it in GitHub Desktop.
dada2_snakefile
configfile:
"config.yaml"
from snakemake.utils import R
import glob
import os
import numpy as np
SETS = ["02_frac", "5_frac", "1_and_63_frac", "unfrac"]
SET_TO_SAMPLES = {s: [os.path.splitext(os.path.basename(fn))[0].split(".")[0] for fn in os.listdir("data/{}/1.cat".format(s))] for s in SETS}
INPUT = [{"set": set_name, "sample": sample, "direction": direction} for set_name in SETS for sample in SET_TO_SAMPLES[set_name] for direction in ["f", "r"]]
def seqtab_names(input):
return ["data/{set}/11.merge/{sample}.rds".format(**d) for d in input if d["direction"]=="f"]
def derep_names(input, set_name, direction):
return ["data/{set}/9.derep/{sample}.{direction}.rds".format(**d) for d in input if d["set"] == set_name and d["direction"]==direction]
derep_random = {(set_name, direction): np.random.choice(derep_names(INPUT, set_name, direction), size=5, replace=False) for set_name in SETS for direction in ["f", "r"]}
localrules:
target
rule target:
input:
combined = "data/combined.seqtab"
rule convert_r:
input:
"data/{set}/1.cat/{sample}.{direction}.fq"
output:
fastq = "data/{set}/2.convert/{sample}.{direction}.fq"
run:
shell('{config[caravan_path]}van.py convert {input} -o {output[fastq]} -r')
rule trim_primer_f:
input:
"data/{set}/2.convert/{sample}.f.fq"
output:
"data/{set}/4.trim/{sample}.f.fq"
shell:
"/home/josephe/dev/caravan/van.py trim ACCCDATGGATCTKAAGCGYGG {input} -o {output} -w 25"
rule trim_primer_r:
input:
"data/{set}/2.convert/{sample}.r.fq"
output:
"data/{set}/4.trim/{sample}.r.fq"
shell:
"/home/josephe/dev/caravan/van.py trim CCRCGRTCGAACTGCATVCCTT {input} -o {output} -w 25"
rule intersect:
input:
f = "data/{set}/4.trim/{sample}.f.fq",
r = "data/{set}/4.trim/{sample}.r.fq"
output:
f = "data/{set}/7.intersect/{sample}.f.fq",
r = "data/{set}/7.intersect/{sample}.r.fq"
shell:
"/home/josephe/dev/caravan/van.py intersect_no_mapping {input[f]} {input[r]} {output[f]} {output[r]}"
rule filter_sample:
input:
f = "data/{set}/7.intersect/{sample}.f.fq",
r = "data/{set}/7.intersect/{sample}.r.fq",
output:
f = "data/{set}/8.filter/{sample}.f.fq",
r = "data/{set}/8.filter/{sample}.r.fq"
params:
f_truncLen = 115,
r_truncLen = 115
run:
R("""
library(ShortRead)
library(dada2)
fastqPairedFilter(
c("{input.f}", "{input.r}"), c("{output.f}", "{output.r}"),
truncLen=c({params.f_truncLen},{params.r_truncLen}),
maxN=0,
maxEE=2,
truncQ=2,
compress=FALSE,
verbose=TRUE
)
if (!file.exists("{output.f}")){{
file.create("{output.f}")
}}
if (!file.exists("{output.r}")){{
file.create("{output.r}")
}}
""")
rule derep_sample:
input:
fq = "data/{set}/8.filter/{sample}.{direction}.fq"
output:
derep = "data/{set}/9.derep/{sample}.{direction}.rds"
run:
R("""
library(ShortRead)
library(dada2)
derep <- derepFastq("{input.fq}", verbose=TRUE)
saveRDS(derep, "{output.derep}")
""")
rule infer_error:
input:
lambda wildcards: derep_random[(wildcards.set, wildcards.direction)]
output:
error_matrix = "data/{set}/error_matrix.{direction}.rds"
run:
input = ",".join(['"{}"'.format(i) for i in input])
R("""
library(ShortRead)
library(dada2)
debug <- lapply(c({input}), readRDS)
saveRDS(debug, "{wildcards.set}.{wildcards.direction}.derep.debug.rds")
drp.learn <- dada2:::combineDereps2(debug)
dd.learn <- dada(drp.learn, err=NULL, selfConsist=TRUE)
saveRDS(dd.learn, "{wildcards.set}.{wildcards.direction}.dada.debug.rds")
err <- dd.learn[[1]]["err_out"]
saveRDS(err, "{output.error_matrix}")
""")
rule denoise_sample:
input:
derep = "data/{set}/9.derep/{sample}.{direction}.rds",
error_matrix = "data/{set}/error_matrix.{direction}.rds"
output:
denoise = "data/{set}/10.denoise/{sample}.{direction}.rds"
run:
R("""
library(ShortRead)
library(dada2)
derep <- readRDS("{input.derep}")
error_matrix <- readRDS("{input.error_matrix}")
denoise <- dada(derep, err=NULL, selfConsist=FALSE)
saveRDS(derep, "{output.denoise}")
""")
rule merge_and_table_sample:
input:
derep_f = "data/{set}/9.derep/{sample}.f.rds",
denoise_f = "data/{set}/10.denoise/{sample}.f.rds",
derep_r = "data/{set}/9.derep/{sample}.r.rds",
denoise_r = "data/{set}/10.denoise/{sample}.r.rds"
output:
seqtab = "data/{set}/11.merge/{sample}.rds"
params:
minOverlap = 14
run:
R("""
library(ShortRead)
library(dada2)
derep_f <- readRDS("{input.derep_f}")
denoise_f <- readRDS("{input.denoise_f}")
derep_r <- readRDS("{input.derep_r}")
denoise_r <- readRDS("{input.denoise_r}")
merge <- mergePairs(denoise_f, derep_f, denoise_r, derep_r, minOverlap={params.minOverlap}, verbose=TRUE)
seqtab <- makeSequenceTable(merge)
saveRDS(seqtab, "{output.seqtab}")
""")
rule combine_tables:
input:
seqtab_names(INPUT)
output:
combined = "data/combined.seqtab"
run:
input = ','.join(['"{}"'.format(i) for i in input])
R("""
library(dada2)
library(ShortRead)
C_isACGT <- function(seqs) {{
.Call('dada2_C_isACGT', PACKAGE = 'dada2', seqs)
}}
is.sequence.table <- function(tab) {{
rval <- is.matrix(tab) && all(tab>=0) &&
!is.null(colnames(tab)) && !is.null(colnames(tab)) &&
all(sapply(colnames(tab), nchar)>0) &&
all(sapply(colnames(tab), C_isACGT)) &&
all(sapply(rownames(tab), nchar)>0)
rval
}}
mergeSequenceTables <- function(table1, table2, ..., orderBy = "abundance") {{
# Combine passed tables into a list
tables <- list(table1, table2)
tables <- c(tables, list(...))
# Validate tables
if(!(all(sapply(tables, is.sequence.table)))) {{
stop("At least two valid sequence tables, and no invalid objects, are expected.")
}}
sample.names <- rownames(tables[[1]])
for(i in seq(2, length(tables))) {{
sample.names <- c(sample.names, rownames(tables[[i]]))
}}
if(any(duplicated(sample.names))) {{
stop("Duplicated sample names detected in the rownames.")
}}
seqs <- unique(c(sapply(tables, colnames), recursive=TRUE))
# Make merged table
rval <- matrix(0L, nrow=length(sample.names), ncol=length(seqs))
rownames(rval) <- sample.names
colnames(rval) <- seqs
for(tab in tables) {{
rval[rownames(tab), colnames(tab)] <- tab
}}
# Order columns
if(!is.null(orderBy)) {{
if(orderBy == "abundance") {{
rval <- rval[,order(colSums(rval), decreasing=TRUE),drop=FALSE]
}} else if(orderBy == "nsamples") {{
rval <- rval[,order(colSums(rval>0), decreasing=TRUE),drop=FALSE]
}}
}}
rval
}}
combined <- mergeSequenceTables(lapply({input}), readRDS)
saveRDS(combined, "{output.combined}")
""")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment