Skip to content

Instantly share code, notes, and snippets.

@conchoecia
Last active June 26, 2018 18:34
Show Gist options
  • Save conchoecia/d48da395d7b5d48a9c16c5eb6a9874e3 to your computer and use it in GitHub Desktop.
Save conchoecia/d48da395d7b5d48a9c16c5eb6a9874e3 to your computer and use it in GitHub Desktop.
Snakemake Trim adapters and quality-trim 10X Chromium data from HiSeq.
import glob
import os
"""
snakemake interprets some output files as being ambiguous since the processing pipeline is not a DAG.
as a results you must use --allow-ambiguity
example usage:
snakemake --cores 90
"""
# vvvv LOOK HERE vvv
#
# your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa
# goes here
trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
maxthreads = 90
#
#
# ^^^^ LOOK HERE ^^^
filenames = [os.path.basename(f) for f in glob.glob("*_R*_*.fastq.gz")]
pairdict = {x.replace("_R1", "").replace("_R2", "").replace(".fastq.gz",""):[]
for x in filenames}
for sample in filenames:
pairdict[sample.replace("_R1", "").replace("_R2", "").replace(".fastq.gz","")].append(sample)
for key in pairdict:
pairdict[key] = sorted(pairdict[key])
samples = list(pairdict.keys())
#make symlinks to more easily process the links
for key in pairdict:
for i in range(len(pairdict[key])):
if i == 0:
output_filepath = "{}_1.fastq.gz".format(key)
elif i == 1:
output_filepath = "{}_2.fastq.gz".format(key)
if os.path.isfile(output_filepath):
os.remove(output_filepath)
shell("ln -sr {0} {1}".format(pairdict[key][i], output_filepath))
rule all:
input:
expand("trimmed/{sample}_1.trim.final.fastq.gz", sample=samples),
expand("trimmed/{sample}_2.trim.final.fastq.gz", sample=samples),
expand("trimmed/{sample}_1.trim.unpaired.fastq.gz", sample=samples),
expand("trimmed/{sample}_2.trim.unpaired.fastq.gz", sample=samples)
rule trim_forward:
input:
fastq = "{sample}_1.fastq.gz",
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar")
output:
temp("trimmed/{sample}_1.trim.fastq.gz")
threads:
maxthreads
message:
"Trimming the first 24 bases from {input.fastq}"
shell:
"""java -jar {input.trim_jar} SE -threads {threads} \
{input.fastq} {output} HEADCROP:24"""
rule trim_reverse:
input:
fastq = "{sample}_2.fastq.gz",
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar")
output:
temp("trimmed/{sample}_2.trim.fastq.gz")
threads:
maxthreads
message:
"Trimming the first 3 bases from {input.fastq}"
shell:
"""java -jar {input.trim_jar} SE -threads {threads} \
{input.fastq} {output} HEADCROP:3"""
rule trim_pairs:
input:
f1 = "trimmed/{sample}_1.trim.fastq.gz",
f2 = "trimmed/{sample}_2.trim.fastq.gz",
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
output:
f_paired = "trimmed/{sample}_1.trim.final.fastq.gz",
r_paired = "trimmed/{sample}_2.trim.final.fastq.gz",
f_unpaired = "trimmed/{sample}_1.trim.unpaired.fastq.gz",
r_unpaired = "trimmed/{sample}_2.trim.unpaired.fastq.gz"
threads:
maxthreads
message:
"Trimming {input.f1} and {input.f2} as a pair."
shell:
"""java -jar {input.trim_jar} PE \
-threads {threads} \
-phred33 {input.f1} {input.f2} \
{output.f_paired} \
{output.f_unpaired} \
{output.r_paired} \
{output.r_unpaired} \
ILLUMINACLIP:{input.adapter_path}:2:30:10:1:TRUE\
LEADING:3 TRAILING:3 \
SLIDINGWINDOW:4:15 MINLEN:36"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment