Skip to content

Instantly share code, notes, and snippets.

@conchoecia
Last active April 13, 2018 13:24
Show Gist options
  • Save conchoecia/a35fcb654ae8decd132224b6a5c53eef to your computer and use it in GitHub Desktop.
Save conchoecia/a35fcb654ae8decd132224b6a5c53eef to your computer and use it in GitHub Desktop.
This gist takes a list of directories that contain fastq files and trims them, runs fastqc, and outputs a QC report with info on trimming efficiency and Hi-C linker sequence content.
import os
from pathlib import Path
import yaml
configfile: "directories.yaml"
# The yaml file should look like this below. Just an object called "directories"
# and a list of directories in which the pipeline should look for things.
#
# directories:
# - "/data/raw/genomic_reads/180315_M00160_0073_000000000-BNBV5/"
# - "/data/raw/genomic_reads/180123_M00160_0067_000000000-BKDV3/"
# - "/data/raw/genomic_reads/171018_M00160_0050_000000000-BFNHV/"
# vvvv LOOK HERE vvv
#
# your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa
# goes here
trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
ntpath = "/data/ncbi/db/nt"
#
#
# ^^^^ LOOK HERE ^^^
def is_fastq(filepath):
suffix_set = set(Path(filepath).suffixes)
fastqs = set([".fastq", ".fq"])
fastq_intersection = suffix_set & fastqs
if len(fastq_intersection) > 0:
return True
else:
return False
#now we must populate the sample list
config["samples"] = {}
for this_d in config["directories"]:
these_files = os.listdir(this_d)
fastqs = [filepath for filepath in these_files if is_fastq(filepath)]
samplelist = set()
for filepath in fastqs:
samplelist.add(filepath.split("_")[0])
for sample in samplelist:
sample_fastqs = sorted([filepath for filepath in fastqs if sample in filepath])
f = os.path.join(this_d, sample_fastqs[0])
r = os.path.join(this_d, sample_fastqs[1])
if sample in config["samples"]:
raise IOError("""Library {} was found in more than two directories.
Please only include one set of readpairs per library:
- {}
- {}""".format(sample, this_d, config["samples"][sample]["f"]))
else:
config["samples"][sample] = {}
config["samples"][sample]["f"] = f
config["samples"][sample]["r"] = r
rule all:
input:
expand("trimmed/{sample}_trimlog.txt", sample=config["samples"]),
expand("report/blast/{sample}_f.blast.txt", sample= config["samples"]),
expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
expand("fastqc/raw/{sample}_f_fastqc.html", sample = config["samples"]),
expand("fastqc/raw/{sample}_f_fastqc.zip", sample = config["samples"]),
expand("fastqc/raw/{sample}_r_fastqc.html", sample = config["samples"]),
expand("fastqc/raw/{sample}_r_fastqc.zip", sample = config["samples"]),
expand("fastqc/trimmed/{sample}_f.trim_fastqc.html", sample = config["samples"]),
expand("fastqc/trimmed/{sample}_f.trim_fastqc.zip", sample = config["samples"]),
expand("fastqc/trimmed/{sample}_r.trim_fastqc.html", sample = config["samples"]),
expand("fastqc/trimmed/{sample}_r.trim_fastqc.zip", sample = config["samples"]),
expand("report/numreads/{sample}.numreads.txt",sample = config["samples"]),
expand("report/numreads/{sample}.trimmed.numreads.txt", sample = config["samples"]),
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
expand( "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample = config["samples"]),
expand( "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample = config["samples"]),
expand( "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample = config["samples"]),
expand( "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample = config["samples"]),
"report/final_report.txt"
rule make_fake_reads:
input:
[config["samples"][sample][readdir] \
for readdir in ['f', 'r'] \
for sample in config["samples"]]
output:
forward = sorted(expand("reads/{sample}_f.fastq.gz", sample=config["samples"])),
reverse = sorted(expand("reads/{sample}_r.fastq.gz", sample=config["samples"]))
message:
"making soft links"
run:
for this_sample in sorted(config["samples"]):
for this_direction in ["f", "r"]:
os.symlink(config["samples"][this_sample][this_direction],
"reads/{}_{}.fastq.gz".format(this_sample, this_direction))
rule trim_pairs:
input:
f1 = "reads/{sample}_f.fastq.gz",
f2 = "reads/{sample}_r.fastq.gz",
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
output:
f_paired = "trimmed/{sample}_f.trim.fastq.gz",
r_paired = "trimmed/{sample}_r.trim.fastq.gz",
f_unpaired = temp("trimmed/{sample}_1.trim.unpaired.fastq.gz"),
r_unpaired = temp("trimmed/{sample}_2.trim.unpaired.fastq.gz"),
trimlog = "trimmed/{sample}_trimlog.txt"
threads:
60
message:
"adapter and quality trimming files"
shell:
"""java -jar {input.trim_jar} PE \
-phred33 -threads {threads} \
-trimlog {output.trimlog}\
{input.f1} {input.f2} \
{output.f_paired} \
{output.f_unpaired} \
{output.r_paired} \
{output.r_unpaired} \
ILLUMINACLIP:{input.adapter_path}:2:30:10 \
LEADING:3 TRAILING:3 \
SLIDINGWINDOW:4:15 MINLEN:36"""
rule num_reads:
input:
forward_trim = "trimmed/{sample}_f.trim.fastq.gz",
f1 = "reads/{sample}_f.fastq.gz"
output:
numreads = "report/numreads/{sample}.numreads.txt",
trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
shell:
"""echo $(bioawk -cfastx 'END{{print NR}}' {input.forward_trim}) > {output.trimmed}; \
echo $(bioawk -cfastx 'END{{print NR}}' {input.f1}) > {output.numreads}"""
rule library_efficiency:
input:
numreads = "report/numreads/{sample}.numreads.txt",
trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
output:
"report/efficiency/{sample}.efficiency.txt"
message:
"calculating the library efficiency"
shell:
"""cat {input.trimmed} {input.numreads} | \
paste - - | \
awk '{{print $1/$2}}' > {output} """
rule fastq_to_fasta_fwd:
input:
forward = "trimmed/{sample}_f.trim.fastq.gz"
output:
temp("trimmed/{sample}_f.fasta")
message:
"Converting fastq forward file to fasta"
shell:
"seqtk seq -A {input.forward} > {output}"
rule sample_fasta:
input:
forward = "trimmed/{sample}_f.fasta"
output:
temp("trimmed/{sample}_f.subsample.fasta")
message:
"Sampling up to 100k reads from the fasta"
shell:
"""NUMREADS=$(bioawk -cfastx 'END{{print NR}}' {input.forward});\
if [ $NUMREADS > 100000 ]; then\
echo '{input.forward} has '"$NUMREADS"' reads. Sampling 100k.'; \
seqtk sample -s100 {input.forward} 100000 > {output};\
else\
echo '{input.forward} has '"$NUMREADS"' reads. Not sampling.';\
cp {input.forward} {output};\
fi
"""
rule blast_sample:
input:
forward = "trimmed/{sample}_f.subsample.fasta"
output:
"report/blast/{sample}_f.blast.txt"
message:
"BLAST the reads"
params:
pntpath = ntpath
threads:
96
shell:
"""blastn -query {input.forward} \
-db {params.pntpath} -outfmt 6 \
-out {output} -num_threads {threads}"""
rule fastqc_raw:
input:
f1 = "reads/{sample}_f.fastq.gz",
f2 = "reads/{sample}_r.fastq.gz",
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
output:
"fastqc/raw/{sample}_f_fastqc.html",
"fastqc/raw/{sample}_f_fastqc.zip",
"fastqc/raw/{sample}_r_fastqc.html",
"fastqc/raw/{sample}_r_fastqc.zip"
message:
"making a fastqc report of the raw reads"
params:
raw_dir = "fastqc/raw/"
shell:
"""fastqc -a {input.adapter_path} \
-o {params.raw_dir} \
{input.f1} {input.f2}"""
rule fastqc_trimmed:
input:
f1 = "trimmed/{sample}_f.trim.fastq.gz",
f2 = "trimmed/{sample}_r.trim.fastq.gz",
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
output:
"fastqc/trimmed/{sample}_f.trim_fastqc.html",
"fastqc/trimmed/{sample}_f.trim_fastqc.zip",
"fastqc/trimmed/{sample}_r.trim_fastqc.html",
"fastqc/trimmed/{sample}_r.trim_fastqc.zip"
message:
"making a fastqc report of the trimmed reads"
params:
raw_dir = "fastqc/trimmed/"
shell:
"""fastqc -a {input.adapter_path} \
-o {params.raw_dir} \
{input.f1} {input.f2}"""
rule GATCGATC_f:
input:
f1 = "reads/{sample}_f.fastq.gz",
f2 = "reads/{sample}_r.fastq.gz",
f1t = "trimmed/{sample}_f.trim.fastq.gz",
f2t = "trimmed/{sample}_r.trim.fastq.gz"
output:
f1o = "report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count",
f2o = "report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count",
f1to = "report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count",
f2to = "report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count"
shell:
"set +o pipefail; "
"bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'GATCGATC' - | wc -l > {output.f1o}; "
"bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'GATCGATC' - | wc -l > {output.f2o}; "
"bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'GATCGATC' - | wc -l > {output.f1to}; "
"bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'GATCGATC' - | wc -l > {output.f2to}"
rule AATTAATT_num:
input:
f1 = "reads/{sample}_f.fastq.gz",
f2 = "reads/{sample}_r.fastq.gz",
f1t = "trimmed/{sample}_f.trim.fastq.gz",
f2t = "trimmed/{sample}_r.trim.fastq.gz"
output:
f1o = "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count",
f2o = "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count",
f1to = "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count",
f2to = "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count"
shell:
"set +o pipefail; "
"bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'AATTAATT' - | wc -l > {output.f1o}; "
"bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'AATTAATT' - | wc -l > {output.f2o}; "
"bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'AATTAATT' - | wc -l > {output.f1to}; "
"bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'AATTAATT' - | wc -l > {output.f2to}"
def read_number_from_file(filename):
with open(filename, "r") as f:
for line in f:
if line.strip():
return line.strip()
rule collate_report:
""" this method prints out the final qc report of all the libraries."""
input:
raw_num = expand("report/numreads/{sample}.numreads.txt", sample=config["samples"]),
trim_num = expand("report/numreads/{sample}.trimmed.numreads.txt", sample=config["samples"]),
efficiency = expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
f1dpn = expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample=config["samples"]),
f2dpn = expand("report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", sample=config["samples"]),
f1mluc = expand("report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample=config["samples"]),
f2mluc = expand("report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample=config["samples"]),
f1tdpn = expand("report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", sample=config["samples"]),
f2tdpn = expand("report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count", sample=config["samples"]),
f1tmluc = expand("report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample=config["samples"]),
f2tmluc = expand("report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample=config["samples"])
output:
"report/final_report.txt"
run:
finalout_handle = open(output[0], "w")
#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
"libname",
"num_reads",
"num_reads_trimmed",
"trim_efficiency",
"f_GATCGATC",
"r_GATCGATC",
"f_AATTAATT",
"r_AATTAATT",
"f_trim_GATCGATC",
"r_trim_GATCGATC",
"f_trim_AATTAATT",
"r_trim_AATTAATT"),
file = finalout_handle)
for thiss in sorted(config["samples"]):
num_reads = read_number_from_file("report/numreads/{}.numreads.txt".format(thiss))
nreads_tr = read_number_from_file("report/numreads/{}.trimmed.numreads.txt".format(thiss))
trim_effi = read_number_from_file("report/efficiency/{}.efficiency.txt".format(thiss))
f_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_f.GATCGATC.count".format(thiss))
r_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_r.GATCGATC.count".format(thiss))
f_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_f.AATTAATT.count".format(thiss))
r_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_r.AATTAATT.count".format(thiss))
f_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_f.trim.GATCGATC.count".format(thiss))
r_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_r.trim.GATCGATC.count".format(thiss))
f_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_f.trim.AATTAATT.count".format(thiss))
r_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_r.trim.AATTAATT.count".format(thiss))
#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
thiss,
num_reads,
nreads_tr,
trim_effi,
int(f_GATCGAT)/int(num_reads),
int(r_GATCGAT)/int(num_reads),
int(f_AATTAAT)/int(num_reads),
int(r_AATTAAT)/int(num_reads),
int(f_trim_GA)/int(nreads_tr),
int(r_trim_GA)/int(nreads_tr),
int(f_trim_AA)/int(nreads_tr),
int(r_trim_AA)/int(nreads_tr)), file = finalout_handle)
finalout_handle.close()
import os
from pathlib import Path
import yaml
configfile: "directories.yaml"
# The yaml file should look like this below. Just an object called "directories"
# and a list of directories in which the pipeline should look for things.
#
# directories:
# - "/data/raw/genomic_reads/180315_M00160_0073_000000000-BNBV5/"
# - "/data/raw/genomic_reads/180123_M00160_0067_000000000-BKDV3/"
# - "/data/raw/genomic_reads/171018_M00160_0050_000000000-BFNHV/"
# vvvv LOOK HERE vvv
#
# your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa
# goes here
trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
ntpath = "/data/ncbi/db/nt"
#
#
# ^^^^ LOOK HERE ^^^
def is_fastq(filepath):
suffix_set = set(Path(filepath).suffixes)
fastqs = set([".fastq", ".fq"])
fastq_intersection = suffix_set & fastqs
if len(fastq_intersection) > 0:
return True
else:
return False
#now we must populate the sample list
config["samples"] = {}
for this_d in config["directories"]:
these_files = os.listdir(this_d)
fastqs = [filepath for filepath in these_files if is_fastq(filepath)]
samplelist = set()
for filepath in fastqs:
samplelist.add(filepath.split("_")[0])
for sample in samplelist:
sample_fastqs = sorted([filepath for filepath in fastqs if sample in filepath])
f = os.path.join(this_d, sample_fastqs[0])
r = os.path.join(this_d, sample_fastqs[1])
if sample in config["samples"]:
raise IOError("""Library {} was found in more than two directories.
Please only include one set of readpairs per library:
- {}
- {}""".format(sample, this_d, config["samples"][sample]["f"]))
else:
config["samples"][sample] = {}
config["samples"][sample]["f"] = f
config["samples"][sample]["r"] = r
rule all:
input:
expand("trimmed/{sample}_trimlog.txt", sample=config["samples"]),
expand("report/blast/{sample}_f.blast.txt", sample= config["samples"]),
expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
expand("fastqc/raw/{sample}_f_fastqc.html", sample = config["samples"]),
expand("fastqc/raw/{sample}_f_fastqc.zip", sample = config["samples"]),
expand("fastqc/raw/{sample}_r_fastqc.html", sample = config["samples"]),
expand("fastqc/raw/{sample}_r_fastqc.zip", sample = config["samples"]),
expand("fastqc/trimmed/{sample}_f.trim_fastqc.html", sample = config["samples"]),
expand("fastqc/trimmed/{sample}_f.trim_fastqc.zip", sample = config["samples"]),
expand("fastqc/trimmed/{sample}_r.trim_fastqc.html", sample = config["samples"]),
expand("fastqc/trimmed/{sample}_r.trim_fastqc.zip", sample = config["samples"]),
expand("report/numreads/{sample}.numreads.txt",sample = config["samples"]),
expand("report/numreads/{sample}.trimmed.numreads.txt", sample = config["samples"]),
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
expand( "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample = config["samples"]),
expand( "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample = config["samples"]),
expand( "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample = config["samples"]),
expand( "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample = config["samples"]),
"report/final_report.txt"
rule make_fake_reads:
input:
[config["samples"][sample][readdir] \
for readdir in ['f', 'r'] \
for sample in config["samples"]]
output:
forward = sorted(expand("reads/{sample}_f.fastq.gz", sample=config["samples"])),
reverse = sorted(expand("reads/{sample}_r.fastq.gz", sample=config["samples"]))
message:
"making soft links"
run:
for this_sample in sorted(config["samples"]):
for this_direction in ["f", "r"]:
os.symlink(config["samples"][this_sample][this_direction],
"reads/{}_{}.fastq.gz".format(this_sample, this_direction))
rule trim_pairs:
input:
f1 = "reads/{sample}_f.fastq.gz",
f2 = "reads/{sample}_r.fastq.gz",
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
output:
f_paired = "trimmed/{sample}_f.trim.fastq.gz",
r_paired = "trimmed/{sample}_r.trim.fastq.gz",
f_unpaired = temp("trimmed/{sample}_1.trim.unpaired.fastq.gz"),
r_unpaired = temp("trimmed/{sample}_2.trim.unpaired.fastq.gz"),
trimlog = "trimmed/{sample}_trimlog.txt"
threads:
60
message:
"adapter and quality trimming files"
shell:
"""java -jar {input.trim_jar} PE \
-phred33 -threads {threads} \
-trimlog {output.trimlog}\
{input.f1} {input.f2} \
{output.f_paired} \
{output.f_unpaired} \
{output.r_paired} \
{output.r_unpaired} \
ILLUMINACLIP:{input.adapter_path}:2:30:10 \
LEADING:3 TRAILING:3 \
SLIDINGWINDOW:4:15 MINLEN:36"""
rule num_reads:
input:
forward_trim = "trimmed/{sample}_f.trim.fastq.gz",
f1 = "reads/{sample}_f.fastq.gz"
output:
numreads = "report/numreads/{sample}.numreads.txt",
trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
shell:
"""echo $(bioawk -cfastx 'END{{print NR}}' {input.forward_trim}) > {output.trimmed}; \
echo $(bioawk -cfastx 'END{{print NR}}' {input.f1}) > {output.numreads}"""
rule library_efficiency:
input:
numreads = "report/numreads/{sample}.numreads.txt",
trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
output:
"report/efficiency/{sample}.efficiency.txt"
message:
"calculating the library efficiency"
shell:
"""cat {input.trimmed} {input.numreads} | \
paste - - | \
awk '{{print $1/$2}}' > {output} """
rule fastq_to_fasta_fwd:
input:
forward = "trimmed/{sample}_f.trim.fastq.gz"
output:
temp("trimmed/{sample}_f.fasta")
message:
"Converting fastq forward file to fasta"
shell:
"seqtk seq -A {input.forward} > {output}"
rule sample_fasta:
input:
forward = "trimmed/{sample}_f.fasta"
output:
temp("trimmed/{sample}_f.subsample.fasta")
message:
"Sampling up to 100k reads from the fasta"
shell:
"""NUMREADS=$(bioawk -cfastx 'END{{print NR}}' {input.forward});\
if [ $NUMREADS > 100000 ]; then\
echo '{input.forward} has '"$NUMREADS"' reads. Sampling 100k.'; \
seqtk sample -s100 {input.forward} 100000 > {output};\
else\
echo '{input.forward} has '"$NUMREADS"' reads. Not sampling.';\
cp {input.forward} {output};\
fi
"""
rule blast_sample:
input:
forward = "trimmed/{sample}_f.subsample.fasta"
output:
"report/blast/{sample}_f.blast.txt"
message:
"BLAST the reads"
params:
pntpath = ntpath
threads:
96
shell:
"""blastn -query {input.forward} \
-db {params.pntpath} -outfmt 6 \
-out {output} -num_threads {threads}"""
rule fastqc_raw:
input:
f1 = "reads/{sample}_f.fastq.gz",
f2 = "reads/{sample}_r.fastq.gz",
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
output:
"fastqc/raw/{sample}_f_fastqc.html",
"fastqc/raw/{sample}_f_fastqc.zip",
"fastqc/raw/{sample}_r_fastqc.html",
"fastqc/raw/{sample}_r_fastqc.zip"
message:
"making a fastqc report of the raw reads"
params:
raw_dir = "fastqc/raw/"
shell:
"""fastqc -a {input.adapter_path} \
-o {params.raw_dir} \
{input.f1} {input.f2}"""
rule fastqc_trimmed:
input:
f1 = "trimmed/{sample}_f.trim.fastq.gz",
f2 = "trimmed/{sample}_r.trim.fastq.gz",
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
output:
"fastqc/trimmed/{sample}_f.trim_fastqc.html",
"fastqc/trimmed/{sample}_f.trim_fastqc.zip",
"fastqc/trimmed/{sample}_r.trim_fastqc.html",
"fastqc/trimmed/{sample}_r.trim_fastqc.zip"
message:
"making a fastqc report of the trimmed reads"
params:
raw_dir = "fastqc/trimmed/"
shell:
"""fastqc -a {input.adapter_path} \
-o {params.raw_dir} \
{input.f1} {input.f2}"""
rule GATCGATC_f:
input:
f1 = "reads/{sample}_f.fastq.gz",
f2 = "reads/{sample}_r.fastq.gz",
f1t = "trimmed/{sample}_f.trim.fastq.gz",
f2t = "trimmed/{sample}_r.trim.fastq.gz"
output:
f1o = "report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count",
f2o = "report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count",
f1to = "report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count",
f2to = "report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count"
shell:
"set +o pipefail; "
"bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'GATCGATC' - | wc -l > {output.f1o}; "
"bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'GATCGATC' - | wc -l > {output.f2o}; "
"bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'GATCGATC' - | wc -l > {output.f1to}; "
"bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'GATCGATC' - | wc -l > {output.f2to}"
rule AATTAATT_num:
input:
f1 = "reads/{sample}_f.fastq.gz",
f2 = "reads/{sample}_r.fastq.gz",
f1t = "trimmed/{sample}_f.trim.fastq.gz",
f2t = "trimmed/{sample}_r.trim.fastq.gz"
output:
f1o = "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count",
f2o = "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count",
f1to = "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count",
f2to = "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count"
shell:
"set +o pipefail; "
"bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'AATTAATT' - | wc -l > {output.f1o}; "
"bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'AATTAATT' - | wc -l > {output.f2o}; "
"bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'AATTAATT' - | wc -l > {output.f1to}; "
"bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'AATTAATT' - | wc -l > {output.f2to}"
def read_number_from_file(filename):
with open(filename, "r") as f:
for line in f:
if line.strip():
return line.strip()
rule collate_report:
""" this method prints out the final qc report of all the libraries."""
input:
raw_num = expand("report/numreads/{sample}.numreads.txt", sample=config["samples"]),
trim_num = expand("report/numreads/{sample}.trimmed.numreads.txt", sample=config["samples"]),
efficiency = expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
f1dpn = expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample=config["samples"]),
f2dpn = expand("report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", sample=config["samples"]),
f1mluc = expand("report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample=config["samples"]),
f2mluc = expand("report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample=config["samples"]),
f1tdpn = expand("report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", sample=config["samples"]),
f2tdpn = expand("report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count", sample=config["samples"]),
f1tmluc = expand("report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample=config["samples"]),
f2tmluc = expand("report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample=config["samples"])
output:
"report/final_report.txt"
run:
finalout_handle = open(output[0], "w")
#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
"libname",
"num_reads",
"num_reads_trimmed",
"trim_efficiency",
"f_GATCGATC",
"r_GATCGATC",
"f_AATTAATT",
"r_AATTAATT",
"f_trim_GATCGATC",
"r_trim_GATCGATC",
"f_trim_AATTAATT",
"r_trim_AATTAATT"),
file = finalout_handle)
for thiss in sorted(config["samples"]):
num_reads = read_number_from_file("report/numreads/{}.numreads.txt".format(thiss))
nreads_tr = read_number_from_file("report/numreads/{}.trimmed.numreads.txt".format(thiss))
trim_effi = read_number_from_file("report/efficiency/{}.efficiency.txt".format(thiss))
f_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_f.GATCGATC.count".format(thiss))
r_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_r.GATCGATC.count".format(thiss))
f_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_f.AATTAATT.count".format(thiss))
r_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_r.AATTAATT.count".format(thiss))
f_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_f.trim.GATCGATC.count".format(thiss))
r_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_r.trim.GATCGATC.count".format(thiss))
f_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_f.trim.AATTAATT.count".format(thiss))
r_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_r.trim.AATTAATT.count".format(thiss))
#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
thiss,
num_reads,
nreads_tr,
trim_effi,
int(f_GATCGAT)/int(num_reads),
int(r_GATCGAT)/int(num_reads),
int(f_AATTAAT)/int(num_reads),
int(r_AATTAAT)/int(num_reads),
int(f_trim_GA)/int(nreads_tr),
int(r_trim_GA)/int(nreads_tr),
int(f_trim_AA)/int(nreads_tr),
int(r_trim_AA)/int(nreads_tr)), file = finalout_handle)
finalout_handle.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment