conchoecia/Snakefile

## config.yaml
samples:
    Thalassicolla_SRR7533167:
        f: "/rna_reads/SRA/SRR7533167_1.fastq.gz"
        r: "/rna_reads/SRA/SRR7533167_2.fastq.gz"
        SS_lib_type: "RF"
        library: "SRR7533167"
        GLO: "NA"

## Snakefile
"""
Author: Darrin Schultz @conchoecia
File: Transcriptome assembly, annotation, and db creation

Instructions:
  - To run this script and all of its analyses: make sure that python 3,
    snakemake, and biopython are installed on your Unix computer.
  - Execute the following command: `snakemake --cores 45`, replacing `45`
    with the number of threads available on your machine.
"""
import os
import sys
from Bio import SeqIO
import shutil

configfile: "config.yaml"
trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
maxthreads = 90
dockeruser = "YOUR_USERNAME_HERE"
curr_dir = os.getcwd()
print(curr_dir)

config["rna_f"] = {}
config["rna_r"] = {}
config["input_reads"] = []
for sample in config["samples"]:
   config["input_reads"].append(config["samples"][sample]["f"])
   config["input_reads"].append(config["samples"][sample]["r"])

# Now we need to change how the SS_lib_type_param is handled
for sample in config["samples"]:
   thisss_lib = config["samples"][sample]["SS_lib_type"]
   if thisss_lib in ["None", "NA", "none", "no", "nein"]:
      config["samples"][sample]["SS_lib_type"] = ""
   elif thisss_lib in ["rf", "RF", "Rf", "rF"]:
      config["samples"][sample]["SS_lib_type"] = "--SS_lib_type RF"
   elif thisss_lib in ["fr", "FR", "Fr", "fR"]:
      config["samples"][sample]["SS_lib_type"] = "--SS_lib_type FR"

def read_number_from_file(filename):
    with open(filename, "r") as f:
       for line in f:
           if line.strip():
               return line.strip()

def singleline_to_multiline(infile, outfile):
   """
   This function takes a fasta file and wraps it to N characters
   """
   out_handle = open(outfile, 'w') #open outfile for writing

   with open(infile, "rU") as handle:
      for record in SeqIO.parse(handle, "fasta"):
         SeqIO.write(record, out_handle, "fasta")
   out_handle.close()

rule all:
    input:
        #make the symlinks
        expand("reads/{sample}_f.fastq.gz", sample = config["samples"]),
        expand("reads/{sample}_r.fastq.gz", sample = config["samples"]),
        ##trim the reads
        expand("trimmed/{sample}_{readtype}.trim.fastq.gz", sample = config["samples"], readtype = ["f", "r"]),
        #now assemble the transcriptomes
        expand("txomes/raw/trinity_{sample}_raw.fasta", sample = config["samples"]),
        # make it multi-line
        expand("txomes/final/{sample}.fasta", sample = config["samples"]),
        # transdecoder and translate, rename the pep files
        expand("pepfiles/final/{sample}.pep", sample = config["samples"]),
        # softlinks for /data/ncbi/db
        expand("/data/ncbi/db/{sample}.fasta",  sample = config["samples"]),
        expand("/data/ncbi/db/{sample}.pep",  sample = config["samples"]),
        # make the blast databases
        expand("/data/ncbi/db/{sample}.fasta.nhr",  sample = config["samples"]),
        expand("/data/ncbi/db/{sample}.pep.phr",  sample = config["samples"]),
        expand("/data/ncbi/db/{sample}.dmnd",  sample = config["samples"]),
        # make the report
        expand("info/counts/raw/{sample}_raw_count.txt", sample = config["samples"]),
        expand("info/counts/trimmed/{sample}_trimmed_count.txt", sample = config["samples"]),
        expand("info/trimming/txome/{sample}_qual_rejects.txt", sample = config["samples"]),
        expand("info/trimming/txome/{sample}_adapter_rejects.txt", sample = config["samples"]),
        expand("info/trimming/txome/{sample}_gc_rejects.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_fasta_count.txt", sample = config["samples"]),
        expand("info/counts/peps/{sample}_pep_count.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_N50.txt", sample = config["samples"]),
        expand("info/counts/peps/{sample}_N50.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_meanlen.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_N90.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_mb.txt", sample = config["samples"]),
        "report/final_report.txt"

###############################################################
#  __   __   ___  __   __   __   __   ___  __   __  _       __
# |__) |__) |__  |__) |__) /  \ /  ` |__  /__` /__` | |\ | / _`
# |    |  \ |___ |    |  \ \__/ \__, |___ .__/ .__/ | | \| \__>
#
#  These are the rules for preprocessing the inputs in order
#   to prepare for the rest of the pipeline.

# preprocessing rule 1
rule make_symlink_f:
   output:
       illumina_f = "reads/{sample}_f.fastq.gz",
   run:
       if not os.path.exists("reads"):
          print("Making a directory called 'reads'.", file = sys.stderr)
          os.makedirs("reads")
       else:
          print("The 'reads' directory exists already.", file = sys.stderr)

       #make symlinks for the illumina reads
       print("Making read symlinks.", file =sys.stderr)
       print("  - Checking files for sample {}".format(wildcards.sample), file=sys.stderr)
       rna_f = "reads/{}_f.fastq.gz".format(wildcards.sample)
       if not os.path.exists(rna_f):
          print("    - Making a symlink for {}".format(rna_f), file=sys.stderr)
          reads_path = config["samples"][wildcards.sample]["f"]
          if not os.path.exists(reads_path):
             raise Exception("{} does not exist.".format(reads_path))
          os.symlink(reads_path, rna_f)
       else:
          print("    - A symlink already exists for {}".format(rna_f), file=sys.stderr)
rule make_symlink_r:
   output:
       illumina_r = "reads/{sample}_r.fastq.gz",
   run:
       if not os.path.exists("reads"):
          print("Making a directory called 'reads'.", file = sys.stderr)
          os.makedirs("reads")
       else:
          print("The 'reads' directory exists already.", file = sys.stderr)

       #make symlinks for the illumina reads
       print("Making read symlinks.", file =sys.stderr)
       for sample in config["samples"]:
          print("  - Checking files for sample {}".format(wildcards.sample), file=sys.stderr)
          rna_r = "reads/{}_r.fastq.gz".format(wildcards.sample)

          if not os.path.exists(rna_r):
             print("    - Making a symlink for {}".format(rna_r), file=sys.stderr)
             reads_path = config["samples"][wildcards.sample]["r"]
             if not os.path.exists(reads_path):
                raise Exception("{} does not exist.".format(reads_path))
             os.symlink(reads_path, rna_r)
          else:
             print("    - A symlink already exists for {}".format(rna_r), file=sys.stderr)

#
# end of rules for preprocessing
#
###########################################################


##############################################################
#       _ ____                _
#     (_) / /_  ______ ___  (_)___  ____ _
#    / / / / / / / __ `__ \/ / __ \/ __ `/
#   / / / / /_/ / / / / / / / / / / /_/ /
#  /_/_/_/\__,_/_/ /_/ /_/_/_/ /_/\__,_/
#
#  These are the rules for working with the Illumina reads.

# illumina rule 1
rule trim_pairs:
    input:
        f = "reads/{sample}_f.fastq.gz",
        r = "reads/{sample}_r.fastq.gz",
        trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
        adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
    output:
        f = "trimmed/{sample}_f.trim.fastq.gz",
        r = "trimmed/{sample}_r.trim.fastq.gz",
        u1= temp("trimmed/{sample}_f.trim.unpaired.fastq.gz"),
        u2= temp("trimmed/{sample}_r.trim.unpaired.fastq.gz")
    threads:
        maxthreads
    shell:
        """java -jar {input.trim_jar} PE \
        -phred33 -threads {threads} \
        {input.f} {input.r} \
        {output.f} \
        {output.u1} \
        {output.r} \
        {output.u2} \
        ILLUMINACLIP:{input.adapter_path}:2:30:10:1:TRUE \
        LEADING:3 TRAILING:3 \
        SLIDINGWINDOW:4:15 MINLEN:36"""

# illumina rule 2
rule assemble_txome:
    input:
        f_paired = "trimmed/{sample}_f.trim.fastq.gz",
        r_paired = "trimmed/{sample}_r.trim.fastq.gz"
    output:
        assemblypath = "txomes/raw/trinity_{sample}_raw.fasta"
    params:
        outpath = "txomes/trinity_{sample}",
        outfasta = "txomes/trinity_{sample}/Trinity.fasta",
        dockeruser = dockeruser,
        sslibtype = lambda wildcards: config["samples"][wildcards.sample]["SS_lib_type"]
    threads:
        maxthreads
    shell:
        """
        docker run \
        -u $(id -u):$(id -g) --rm  \
        -v `pwd`:`pwd` \
        -v /etc/passwd:/etc/passwd \
        trinity_bowtie2.3.4.1 Trinity \
        --seqType fq \
        --left `pwd`/{input.f_paired} \
        --right `pwd`/{input.r_paired} \
        --max_memory 100G \
        --CPU {threads} \
        --trimmomatic \
        {params.sslibtype} \
        --output `pwd`/{params.outpath};
        mv {params.outfasta} {output.assemblypath}
        rm -rf {params.outpath}
        """

# illumina rule 3
rule correct_txome_names:
    input:
        assem = "txomes/raw/trinity_{sample}_raw.fasta"
    output:
        fassem = temp("txomes/singleline/singleline_trinity_{sample}.fasta")
    params:
        samplename = "{sample}"
    shell:
        """
        sed 's/^>TRINITY/>{params.samplename}/' {input.assem} > {output.fassem}
        """

# illumina rule 4
rule illumina_txome_single_line_to_multiline:
    """
    This turns the single line output of Trinity into a multi-line fasta
    file that can be indexed and used in something like bwa.
    """
    input:
        fassem = "txomes/singleline/singleline_trinity_{sample}.fasta"
    output:
        assem = "txomes/final/{sample}.fasta"
    run:
        singleline_to_multiline(input.fassem, output.assem)

# illumina rule 5
rule translate_txome:
    input:
        txome = "txomes/final/{sample}.fasta"
    output:
        outpeps = temp("{sample}.fasta.transdecoder_dir/longest_orfs.pep")
    shell:
        """
        TransDecoder.LongOrfs -t {input.txome}
        """
# illumina rule 5.1
rule trim_transdecoder_names:
    """
    This trims transdecoder names from an extended string like:
      >CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1.p1 type:internal len:122 gc:universal CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1:1-363(+)
    to:
      >CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1.p1
    """
    input:
        pepfile = "{sample}.fasta.transdecoder_dir/longest_orfs.pep"
    output:
        renamed = temp("pepfiles/temp/{sample}_longest_orfs_renamed.pep")
    shell:
        """
        bioawk -cfastx '{{printf(">%s\\n%s\\n", $name, $seq)}}' {input.pepfile} > {output.renamed}
        """

# illumina rule 6
rule move_and_multiline_peps:
    """
    This moves the translated txome to its final resting place and
    turns it from singleline to multiline.
    """
    input:
        pepfile = "pepfiles/temp/{sample}_longest_orfs_renamed.pep"
    output:
        pepfinal = "pepfiles/final/{sample}.pep"
    params:
        rmdir = lambda wildcards: "{}.fasta.transdecoder_dir".format(wildcards.sample),
        checkpoints = lambda wildcards: "{}.fasta.transdecoder_dir.__checkpoints_longorfs".format(wildcards.sample)
    run:
        singleline_to_multiline(input.pepfile, output.pepfinal)
        if os.path.exists(params.rmdir):
           shutil.rmtree(params.rmdir)
        if os.path.exists(params.checkpoints):
           shutil.rmtree(params.checkpoints)

# illumina rule 7 fasta softlink to db
rule softlink_fasta_data:
    input:
        assem = "txomes/final/{sample}.fasta"
    output:
        outlink = "/data/ncbi/db/{sample}.fasta"
    params:
        absln = lambda wildcards: "{}/txomes/final/{}.fasta".format(curr_dir, wildcards.sample)
    shell:
        """
        ln -s {params.absln} {output.outlink}
        """

# illumina rule 8 pep softlink to db
rule softlink_pep_data:
    input:
        pep = "pepfiles/final/{sample}.pep"
    output:
        outlink = "/data/ncbi/db/{sample}.pep"
    params:
        absln = lambda wildcards: "{}/pepfiles/final/{}.pep".format(curr_dir, wildcards.sample)
    shell:
        """
        ln -s {params.absln} {output.outlink}
        """

# illumina rule 9
rule nucl_db_data:
    input:
        inp = "/data/ncbi/db/{sample}.fasta"
    output:
        out = "/data/ncbi/db/{sample}.fasta.nhr"
    shell:
        """
        makeblastdb -in {input.inp} -input_type fasta -dbtype nucl -parse_seqids
        """

# illumina rule 10
rule prot_db_data:
    input:
        inp = "/data/ncbi/db/{sample}.pep"
    output:
        out = "/data/ncbi/db/{sample}.pep.phr"
    shell:
        """
        makeblastdb -in {input.inp} -input_type fasta -dbtype prot -parse_seqids
        """

# ILLUMINA rule 11
rule diamond_data:
    input:
        pepfile="/data/ncbi/db/{sample}.pep"
    output:
        ddb = "/data/ncbi/db/{sample}.dmnd"
    shell:
        """
        diamond makedb --in {input.pepfile} -d {output.ddb}
        """
# report
# this section handles writing a report on all of the samples.

# Fields to acquire:
#  - number of reads in untrimmed
#  - number of reads in trimmed
#  - number of transcripts
#  - number of peps

rule number_reads_untrimmed:
    input:
        raw = "reads/{sample}_f.fastq.gz"
    output:
        raw_count = "info/counts/raw/{sample}_raw_count.txt"
    shell:
        """
        echo $(bioawk -cfastx 'END{{print NR}}' {input.raw}) > {output.raw_count}
        """

rule number_reads_trimmed:
    input:
        trimmed = "trimmed/{sample}_f.trim.fastq.gz"
    output:
        trimmed_count = "info/counts/trimmed/{sample}_trimmed_count.txt"
    shell:
        """
        echo $(bioawk -cfastx 'END{{print NR}}' {input.trimmed}) > {output.trimmed_count}
        """

rule number_transcripts:
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_counts = "info/counts/fasta/{sample}_fasta_count.txt"
    shell:
        """
        echo $(bioawk -cfastx 'END{{print NR}}' {input.fasta}) > {output.fasta_counts}
        """

rule number_peps:
    input:
        pep = "pepfiles/final/{sample}.pep"
    output:
        pep_count = "info/counts/peps/{sample}_pep_count.txt"
    shell:
        """
        echo $(bioawk -cfastx 'END{{print NR}}' {input.pep}) > {output.pep_count}
        """

rule calcN50_tx:
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_N50 = "info/counts/fasta/{sample}_N50.txt"
    shell:
        """
        bioawk -cfastx '{{print(length($seq))}}' {input.fasta} | \
         sort -n | \
         awk '{{len[i++]=$1;sum+=$1}} \
              END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
              if (csum>=sum/2) {{print len[j];break}} }} }}' > {output.fasta_N50}
        """

rule calcN90_tx:
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_N90 = "info/counts/fasta/{sample}_N90.txt"
    shell:
        """
        bioawk -cfastx '{{print(length($seq))}}' {input.fasta} | \
         sort -n | \
         awk '{{len[i++]=$1;sum+=$1}} \
              END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
              if (csum>=(sum*0.9)) {{print len[j];break}} }} }}' > {output.fasta_N90}
        """

rule calcN50_pep:
    input:
        pep = "pepfiles/final/{sample}.pep"
    output:
        pep_N50 = "info/counts/peps/{sample}_N50.txt"
    shell:
        """
        bioawk -cfastx '{{print(length($seq))}}' {input.pep} | \
         sort -n | \
         awk '{{len[i++]=$1;sum+=$1}} \
              END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
              if (csum>=sum/2) {{print len[j];break}} }} }}' > {output.pep_N50}
        """

rule quality_rejects:
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        q_rejects = "info/trimming/txome/{sample}_qual_rejects.txt"
    shell:
        """
        echo "0" > {output.q_rejects}
        """

rule adapter_rejects:
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        a_rejects = "info/trimming/txome/{sample}_adapter_rejects.txt"
    shell:
        """
        echo "0" > {output.a_rejects}
        """

rule GC_rejects:
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        g_rejects = "info/trimming/txome/{sample}_gc_rejects.txt"
    shell:
        """
        echo "0" > {output.g_rejects}
        """

rule calcmean:
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_meanlen = "info/counts/fasta/{sample}_meanlen.txt"
    shell:
        """
        bioawk -cfastx '{{sum += length($seq)}} \
          END{{printf("%.2f", sum/NR)}}' {input.fasta} > {output.fasta_meanlen}
        """

rule calcMB:
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_numMB = "info/counts/fasta/{sample}_mb.txt"
    shell:
        """
        bioawk -cfastx '{{sum += length($seq)}} \
          END{{printf("%.2f", sum/1000000)}}' {input.fasta} > {output.fasta_numMB}
        """


rule collate_report:
    """ this method prints out the final qc report of all the libraries."""
    input:
        raw_num     = expand("info/counts/raw/{sample}_raw_count.txt", sample = config["samples"]),
        qual_reject = expand("info/trimming/txome/{sample}_qual_rejects.txt", sample = config["samples"]),
        adap_reject = expand("info/trimming/txome/{sample}_adapter_rejects.txt", sample = config["samples"]),
        gc_reject   = expand("info/trimming/txome/{sample}_gc_rejects.txt", sample = config["samples"]),
        trim_num    = expand("info/counts/trimmed/{sample}_trimmed_count.txt", sample = config["samples"]),
        txome_count = expand("info/counts/fasta/{sample}_fasta_count.txt", sample = config["samples"]),
        pep_count   = expand("info/counts/peps/{sample}_pep_count.txt", sample = config["samples"]),
        txome_N50   = expand("info/counts/fasta/{sample}_N50.txt", sample = config["samples"]),
        pep_N50     = expand("info/counts/peps/{sample}_N50.txt", sample = config["samples"]),
        tx_meanlen  = expand("info/counts/fasta/{sample}_meanlen.txt", sample = config["samples"]),
        tx_N90      = expand("info/counts/fasta/{sample}_N90.txt", sample = config["samples"]),
        tx_MB       = expand("info/counts/fasta/{sample}_mb.txt", sample = config["samples"]),

    output:
        "report/final_report.txt"
    run:
        finalout_handle = open(output[0], "w")
        #print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
                        "sample",
                        "trim_efficiency",
                        "num_reads",
                        "num_reads_trimmed",
                        "quality_rejects",
                        "adapter_rejects",
                        "gc_rejects",
                        "transcripts_count",
                        "transcripts_meanlen",
                        "transcripts_N50",
                        "transcripts_N90",
                        "transcripts_MB",
                        "peps_count",
                        "peps_N50"),
              file = finalout_handle)
        for thiss in sorted(config["samples"]):
            raw_num     = read_number_from_file("info/counts/raw/{}_raw_count.txt".format(thiss))
            trim_num    = read_number_from_file("info/counts/trimmed/{}_trimmed_count.txt".format(thiss))
            txome_count = read_number_from_file("info/counts/fasta/{}_fasta_count.txt".format(thiss))
            pep_count   = read_number_from_file("info/counts/peps/{}_pep_count.txt".format(thiss))
            txome_N50   = read_number_from_file("info/counts/fasta/{}_N50.txt".format(thiss))
            pep_N50     = read_number_from_file("info/counts/peps/{}_N50.txt".format(thiss))
            qual_reject = read_number_from_file("info/trimming/txome/{}_qual_rejects.txt".format(thiss))
            adap_reject = read_number_from_file("info/trimming/txome/{}_adapter_rejects.txt".format(thiss))
            gc_reject   = read_number_from_file("info/trimming/txome/{}_gc_rejects.txt".format(thiss))
            tx_meanlen  = read_number_from_file("info/counts/fasta/{}_meanlen.txt".format(thiss))
            txome_N90   = read_number_from_file("info/counts/fasta/{}_N90.txt".format(thiss))
            txome_MB    = read_number_from_file("info/counts/fasta/{}_mb.txt".format(thiss))
            print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
               thiss,
               int(trim_num)/int(raw_num),
               raw_num,
               trim_num,
               qual_reject,
               adap_reject,
               gc_reject,
               txome_count,
               tx_meanlen,
               txome_N50,
               txome_N90,
               txome_MB,
               pep_count,
               pep_N50
            ), file = finalout_handle)
        finalout_handle.close()
	samples:
	Thalassicolla_SRR7533167:
	f: "/rna_reads/SRA/SRR7533167_1.fastq.gz"
	r: "/rna_reads/SRA/SRR7533167_2.fastq.gz"
	SS_lib_type: "RF"
	library: "SRR7533167"
	GLO: "NA"
	"""
	Author: Darrin Schultz @conchoecia
	File: Transcriptome assembly, annotation, and db creation

	Instructions:
	- To run this script and all of its analyses: make sure that python 3,
	snakemake, and biopython are installed on your Unix computer.
	- Execute the following command: `snakemake --cores 45`, replacing `45`
	with the number of threads available on your machine.
	"""
	import os
	import sys
	from Bio import SeqIO
	import shutil

	configfile: "config.yaml"
	trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
	maxthreads = 90
	dockeruser = "YOUR_USERNAME_HERE"
	curr_dir = os.getcwd()
	print(curr_dir)

	config["rna_f"] = {}
	config["rna_r"] = {}
	config["input_reads"] = []
	for sample in config["samples"]:
	config["input_reads"].append(config["samples"][sample]["f"])
	config["input_reads"].append(config["samples"][sample]["r"])

	# Now we need to change how the SS_lib_type_param is handled
	for sample in config["samples"]:
	thisss_lib = config["samples"][sample]["SS_lib_type"]
	if thisss_lib in ["None", "NA", "none", "no", "nein"]:
	config["samples"][sample]["SS_lib_type"] = ""
	elif thisss_lib in ["rf", "RF", "Rf", "rF"]:
	config["samples"][sample]["SS_lib_type"] = "--SS_lib_type RF"
	elif thisss_lib in ["fr", "FR", "Fr", "fR"]:
	config["samples"][sample]["SS_lib_type"] = "--SS_lib_type FR"

	def read_number_from_file(filename):
	with open(filename, "r") as f:
	for line in f:
	if line.strip():
	return line.strip()

	def singleline_to_multiline(infile, outfile):
	"""
	This function takes a fasta file and wraps it to N characters
	"""
	out_handle = open(outfile, 'w') #open outfile for writing

	with open(infile, "rU") as handle:
	for record in SeqIO.parse(handle, "fasta"):
	SeqIO.write(record, out_handle, "fasta")
	out_handle.close()

	rule all:
	input:
	#make the symlinks
	expand("reads/{sample}_f.fastq.gz", sample = config["samples"]),
	expand("reads/{sample}_r.fastq.gz", sample = config["samples"]),
	##trim the reads
	expand("trimmed/{sample}_{readtype}.trim.fastq.gz", sample = config["samples"], readtype = ["f", "r"]),
	#now assemble the transcriptomes
	expand("txomes/raw/trinity_{sample}_raw.fasta", sample = config["samples"]),
	# make it multi-line
	expand("txomes/final/{sample}.fasta", sample = config["samples"]),
	# transdecoder and translate, rename the pep files
	expand("pepfiles/final/{sample}.pep", sample = config["samples"]),
	# softlinks for /data/ncbi/db
	expand("/data/ncbi/db/{sample}.fasta", sample = config["samples"]),
	expand("/data/ncbi/db/{sample}.pep", sample = config["samples"]),
	# make the blast databases
	expand("/data/ncbi/db/{sample}.fasta.nhr", sample = config["samples"]),
	expand("/data/ncbi/db/{sample}.pep.phr", sample = config["samples"]),
	expand("/data/ncbi/db/{sample}.dmnd", sample = config["samples"]),
	# make the report
	expand("info/counts/raw/{sample}_raw_count.txt", sample = config["samples"]),
	expand("info/counts/trimmed/{sample}_trimmed_count.txt", sample = config["samples"]),
	expand("info/trimming/txome/{sample}_qual_rejects.txt", sample = config["samples"]),
	expand("info/trimming/txome/{sample}_adapter_rejects.txt", sample = config["samples"]),
	expand("info/trimming/txome/{sample}_gc_rejects.txt", sample = config["samples"]),
	expand("info/counts/fasta/{sample}_fasta_count.txt", sample = config["samples"]),
	expand("info/counts/peps/{sample}_pep_count.txt", sample = config["samples"]),
	expand("info/counts/fasta/{sample}_N50.txt", sample = config["samples"]),
	expand("info/counts/peps/{sample}_N50.txt", sample = config["samples"]),
	expand("info/counts/fasta/{sample}_meanlen.txt", sample = config["samples"]),
	expand("info/counts/fasta/{sample}_N90.txt", sample = config["samples"]),
	expand("info/counts/fasta/{sample}_mb.txt", sample = config["samples"]),
	"report/final_report.txt"

	###############################################################
	# __ __ ___ __ __ __ __ ___ __ __ _ __
	# \|__) \|__) \|__ \|__) \|__) / \ / ` \|__ /__` /__` \| \|\ \| / _`
	# \| \| \ \|___ \| \| \ \__/ \__, \|___ .__/ .__/ \| \| \\| \__>
	#
	# These are the rules for preprocessing the inputs in order
	# to prepare for the rest of the pipeline.

	# preprocessing rule 1
	rule make_symlink_f:
	output:
	illumina_f = "reads/{sample}_f.fastq.gz",
	run:
	if not os.path.exists("reads"):
	print("Making a directory called 'reads'.", file = sys.stderr)
	os.makedirs("reads")
	else:
	print("The 'reads' directory exists already.", file = sys.stderr)

	#make symlinks for the illumina reads
	print("Making read symlinks.", file =sys.stderr)
	print(" - Checking files for sample {}".format(wildcards.sample), file=sys.stderr)
	rna_f = "reads/{}_f.fastq.gz".format(wildcards.sample)
	if not os.path.exists(rna_f):
	print(" - Making a symlink for {}".format(rna_f), file=sys.stderr)
	reads_path = config["samples"][wildcards.sample]["f"]
	if not os.path.exists(reads_path):
	raise Exception("{} does not exist.".format(reads_path))
	os.symlink(reads_path, rna_f)
	else:
	print(" - A symlink already exists for {}".format(rna_f), file=sys.stderr)
	rule make_symlink_r:
	output:
	illumina_r = "reads/{sample}_r.fastq.gz",
	run:
	if not os.path.exists("reads"):
	print("Making a directory called 'reads'.", file = sys.stderr)
	os.makedirs("reads")
	else:
	print("The 'reads' directory exists already.", file = sys.stderr)

	#make symlinks for the illumina reads
	print("Making read symlinks.", file =sys.stderr)
	for sample in config["samples"]:
	print(" - Checking files for sample {}".format(wildcards.sample), file=sys.stderr)
	rna_r = "reads/{}_r.fastq.gz".format(wildcards.sample)

	if not os.path.exists(rna_r):
	print(" - Making a symlink for {}".format(rna_r), file=sys.stderr)
	reads_path = config["samples"][wildcards.sample]["r"]
	if not os.path.exists(reads_path):
	raise Exception("{} does not exist.".format(reads_path))
	os.symlink(reads_path, rna_r)
	else:
	print(" - A symlink already exists for {}".format(rna_r), file=sys.stderr)

	#
	# end of rules for preprocessing
	#
	###########################################################


	##############################################################
	# _ ____ _
	# (_) / /_ ______ ___ (_)___ ____ _
	# / / / / / / / __ `__ \/ / __ \/ __ `/
	# / / / / /_/ / / / / / / / / / / /_/ /
	# /_/_/_/\__,_/_/ /_/ /_/_/_/ /_/\__,_/
	#
	# These are the rules for working with the Illumina reads.

	# illumina rule 1
	rule trim_pairs:
	input:
	f = "reads/{sample}_f.fastq.gz",
	r = "reads/{sample}_r.fastq.gz",
	trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
	adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
	output:
	f = "trimmed/{sample}_f.trim.fastq.gz",
	r = "trimmed/{sample}_r.trim.fastq.gz",
	u1= temp("trimmed/{sample}_f.trim.unpaired.fastq.gz"),
	u2= temp("trimmed/{sample}_r.trim.unpaired.fastq.gz")
	threads:
	maxthreads
	shell:
	"""java -jar {input.trim_jar} PE \
	-phred33 -threads {threads} \
	{input.f} {input.r} \
	{output.f} \
	{output.u1} \
	{output.r} \
	{output.u2} \
	ILLUMINACLIP:{input.adapter_path}:2:30:10:1:TRUE \
	LEADING:3 TRAILING:3 \
	SLIDINGWINDOW:4:15 MINLEN:36"""

	# illumina rule 2
	rule assemble_txome:
	input:
	f_paired = "trimmed/{sample}_f.trim.fastq.gz",
	r_paired = "trimmed/{sample}_r.trim.fastq.gz"
	output:
	assemblypath = "txomes/raw/trinity_{sample}_raw.fasta"
	params:
	outpath = "txomes/trinity_{sample}",
	outfasta = "txomes/trinity_{sample}/Trinity.fasta",
	dockeruser = dockeruser,
	sslibtype = lambda wildcards: config["samples"][wildcards.sample]["SS_lib_type"]
	threads:
	maxthreads
	shell:
	"""
	docker run \
	-u $(id -u):$(id -g) --rm \
	-v `pwd`:`pwd` \
	-v /etc/passwd:/etc/passwd \
	trinity_bowtie2.3.4.1 Trinity \
	--seqType fq \
	--left `pwd`/{input.f_paired} \
	--right `pwd`/{input.r_paired} \
	--max_memory 100G \
	--CPU {threads} \
	--trimmomatic \
	{params.sslibtype} \
	--output `pwd`/{params.outpath};
	mv {params.outfasta} {output.assemblypath}
	rm -rf {params.outpath}
	"""

	# illumina rule 3
	rule correct_txome_names:
	input:
	assem = "txomes/raw/trinity_{sample}_raw.fasta"
	output:
	fassem = temp("txomes/singleline/singleline_trinity_{sample}.fasta")
	params:
	samplename = "{sample}"
	shell:
	"""
	sed 's/^>TRINITY/>{params.samplename}/' {input.assem} > {output.fassem}
	"""

	# illumina rule 4
	rule illumina_txome_single_line_to_multiline:
	"""
	This turns the single line output of Trinity into a multi-line fasta
	file that can be indexed and used in something like bwa.
	"""
	input:
	fassem = "txomes/singleline/singleline_trinity_{sample}.fasta"
	output:
	assem = "txomes/final/{sample}.fasta"
	run:
	singleline_to_multiline(input.fassem, output.assem)

	# illumina rule 5
	rule translate_txome:
	input:
	txome = "txomes/final/{sample}.fasta"
	output:
	outpeps = temp("{sample}.fasta.transdecoder_dir/longest_orfs.pep")
	shell:
	"""
	TransDecoder.LongOrfs -t {input.txome}
	"""
	# illumina rule 5.1
	rule trim_transdecoder_names:
	"""
	This trims transdecoder names from an extended string like:
	>CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1.p1 type:internal len:122 gc:universal CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1:1-363(+)
	to:
	>CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1.p1
	"""
	input:
	pepfile = "{sample}.fasta.transdecoder_dir/longest_orfs.pep"
	output:
	renamed = temp("pepfiles/temp/{sample}_longest_orfs_renamed.pep")
	shell:
	"""
	bioawk -cfastx '{{printf(">%s\\n%s\\n", $name, $seq)}}' {input.pepfile} > {output.renamed}
	"""

	# illumina rule 6
	rule move_and_multiline_peps:
	"""
	This moves the translated txome to its final resting place and
	turns it from singleline to multiline.
	"""
	input:
	pepfile = "pepfiles/temp/{sample}_longest_orfs_renamed.pep"
	output:
	pepfinal = "pepfiles/final/{sample}.pep"
	params:
	rmdir = lambda wildcards: "{}.fasta.transdecoder_dir".format(wildcards.sample),
	checkpoints = lambda wildcards: "{}.fasta.transdecoder_dir.__checkpoints_longorfs".format(wildcards.sample)
	run:
	singleline_to_multiline(input.pepfile, output.pepfinal)
	if os.path.exists(params.rmdir):
	shutil.rmtree(params.rmdir)
	if os.path.exists(params.checkpoints):
	shutil.rmtree(params.checkpoints)

	# illumina rule 7 fasta softlink to db
	rule softlink_fasta_data:
	input:
	assem = "txomes/final/{sample}.fasta"
	output:
	outlink = "/data/ncbi/db/{sample}.fasta"
	params:
	absln = lambda wildcards: "{}/txomes/final/{}.fasta".format(curr_dir, wildcards.sample)
	shell:
	"""
	ln -s {params.absln} {output.outlink}
	"""

	# illumina rule 8 pep softlink to db
	rule softlink_pep_data:
	input:
	pep = "pepfiles/final/{sample}.pep"
	output:
	outlink = "/data/ncbi/db/{sample}.pep"
	params:
	absln = lambda wildcards: "{}/pepfiles/final/{}.pep".format(curr_dir, wildcards.sample)
	shell:
	"""
	ln -s {params.absln} {output.outlink}
	"""

	# illumina rule 9
	rule nucl_db_data:
	input:
	inp = "/data/ncbi/db/{sample}.fasta"
	output:
	out = "/data/ncbi/db/{sample}.fasta.nhr"
	shell:
	"""
	makeblastdb -in {input.inp} -input_type fasta -dbtype nucl -parse_seqids
	"""

	# illumina rule 10
	rule prot_db_data:
	input:
	inp = "/data/ncbi/db/{sample}.pep"
	output:
	out = "/data/ncbi/db/{sample}.pep.phr"
	shell:
	"""
	makeblastdb -in {input.inp} -input_type fasta -dbtype prot -parse_seqids
	"""

	# ILLUMINA rule 11
	rule diamond_data:
	input:
	pepfile="/data/ncbi/db/{sample}.pep"
	output:
	ddb = "/data/ncbi/db/{sample}.dmnd"
	shell:
	"""
	diamond makedb --in {input.pepfile} -d {output.ddb}
	"""
	# report
	# this section handles writing a report on all of the samples.

	# Fields to acquire:
	# - number of reads in untrimmed
	# - number of reads in trimmed
	# - number of transcripts
	# - number of peps

	rule number_reads_untrimmed:
	input:
	raw = "reads/{sample}_f.fastq.gz"
	output:
	raw_count = "info/counts/raw/{sample}_raw_count.txt"
	shell:
	"""
	echo $(bioawk -cfastx 'END{{print NR}}' {input.raw}) > {output.raw_count}
	"""

	rule number_reads_trimmed:
	input:
	trimmed = "trimmed/{sample}_f.trim.fastq.gz"
	output:
	trimmed_count = "info/counts/trimmed/{sample}_trimmed_count.txt"
	shell:
	"""
	echo $(bioawk -cfastx 'END{{print NR}}' {input.trimmed}) > {output.trimmed_count}
	"""

	rule number_transcripts:
	input:
	fasta = "txomes/final/{sample}.fasta"
	output:
	fasta_counts = "info/counts/fasta/{sample}_fasta_count.txt"
	shell:
	"""
	echo $(bioawk -cfastx 'END{{print NR}}' {input.fasta}) > {output.fasta_counts}
	"""

	rule number_peps:
	input:
	pep = "pepfiles/final/{sample}.pep"
	output:
	pep_count = "info/counts/peps/{sample}_pep_count.txt"
	shell:
	"""
	echo $(bioawk -cfastx 'END{{print NR}}' {input.pep}) > {output.pep_count}
	"""

	rule calcN50_tx:
	input:
	fasta = "txomes/final/{sample}.fasta"
	output:
	fasta_N50 = "info/counts/fasta/{sample}_N50.txt"
	shell:
	"""
	bioawk -cfastx '{{print(length($seq))}}' {input.fasta} \| \
	sort -n \| \
	awk '{{len[i++]=$1;sum+=$1}} \
	END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
	if (csum>=sum/2) {{print len[j];break}} }} }}' > {output.fasta_N50}
	"""

	rule calcN90_tx:
	input:
	fasta = "txomes/final/{sample}.fasta"
	output:
	fasta_N90 = "info/counts/fasta/{sample}_N90.txt"
	shell:
	"""
	bioawk -cfastx '{{print(length($seq))}}' {input.fasta} \| \
	sort -n \| \
	awk '{{len[i++]=$1;sum+=$1}} \
	END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
	if (csum>=(sum*0.9)) {{print len[j];break}} }} }}' > {output.fasta_N90}
	"""

	rule calcN50_pep:
	input:
	pep = "pepfiles/final/{sample}.pep"
	output:
	pep_N50 = "info/counts/peps/{sample}_N50.txt"
	shell:
	"""
	bioawk -cfastx '{{print(length($seq))}}' {input.pep} \| \
	sort -n \| \
	awk '{{len[i++]=$1;sum+=$1}} \
	END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
	if (csum>=sum/2) {{print len[j];break}} }} }}' > {output.pep_N50}
	"""

	rule quality_rejects:
	input:
	fasta = "txomes/final/{sample}.fasta"
	output:
	q_rejects = "info/trimming/txome/{sample}_qual_rejects.txt"
	shell:
	"""
	echo "0" > {output.q_rejects}
	"""

	rule adapter_rejects:
	input:
	fasta = "txomes/final/{sample}.fasta"
	output:
	a_rejects = "info/trimming/txome/{sample}_adapter_rejects.txt"
	shell:
	"""
	echo "0" > {output.a_rejects}
	"""

	rule GC_rejects:
	input:
	fasta = "txomes/final/{sample}.fasta"
	output:
	g_rejects = "info/trimming/txome/{sample}_gc_rejects.txt"
	shell:
	"""
	echo "0" > {output.g_rejects}
	"""

	rule calcmean:
	input:
	fasta = "txomes/final/{sample}.fasta"
	output:
	fasta_meanlen = "info/counts/fasta/{sample}_meanlen.txt"
	shell:
	"""
	bioawk -cfastx '{{sum += length($seq)}} \
	END{{printf("%.2f", sum/NR)}}' {input.fasta} > {output.fasta_meanlen}
	"""

	rule calcMB:
	input:
	fasta = "txomes/final/{sample}.fasta"
	output:
	fasta_numMB = "info/counts/fasta/{sample}_mb.txt"
	shell:
	"""
	bioawk -cfastx '{{sum += length($seq)}} \
	END{{printf("%.2f", sum/1000000)}}' {input.fasta} > {output.fasta_numMB}
	"""



	rule collate_report:
	""" this method prints out the final qc report of all the libraries."""
	input:
	raw_num = expand("info/counts/raw/{sample}_raw_count.txt", sample = config["samples"]),
	qual_reject = expand("info/trimming/txome/{sample}_qual_rejects.txt", sample = config["samples"]),
	adap_reject = expand("info/trimming/txome/{sample}_adapter_rejects.txt", sample = config["samples"]),
	gc_reject = expand("info/trimming/txome/{sample}_gc_rejects.txt", sample = config["samples"]),
	trim_num = expand("info/counts/trimmed/{sample}_trimmed_count.txt", sample = config["samples"]),
	txome_count = expand("info/counts/fasta/{sample}_fasta_count.txt", sample = config["samples"]),
	pep_count = expand("info/counts/peps/{sample}_pep_count.txt", sample = config["samples"]),
	txome_N50 = expand("info/counts/fasta/{sample}_N50.txt", sample = config["samples"]),
	pep_N50 = expand("info/counts/peps/{sample}_N50.txt", sample = config["samples"]),
	tx_meanlen = expand("info/counts/fasta/{sample}_meanlen.txt", sample = config["samples"]),
	tx_N90 = expand("info/counts/fasta/{sample}_N90.txt", sample = config["samples"]),
	tx_MB = expand("info/counts/fasta/{sample}_mb.txt", sample = config["samples"]),

	output:
	"report/final_report.txt"
	run:
	finalout_handle = open(output[0], "w")
	#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
	print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
	"sample",
	"trim_efficiency",
	"num_reads",
	"num_reads_trimmed",
	"quality_rejects",
	"adapter_rejects",
	"gc_rejects",
	"transcripts_count",
	"transcripts_meanlen",
	"transcripts_N50",
	"transcripts_N90",
	"transcripts_MB",
	"peps_count",
	"peps_N50"),
	file = finalout_handle)
	for thiss in sorted(config["samples"]):
	raw_num = read_number_from_file("info/counts/raw/{}_raw_count.txt".format(thiss))
	trim_num = read_number_from_file("info/counts/trimmed/{}_trimmed_count.txt".format(thiss))
	txome_count = read_number_from_file("info/counts/fasta/{}_fasta_count.txt".format(thiss))
	pep_count = read_number_from_file("info/counts/peps/{}_pep_count.txt".format(thiss))
	txome_N50 = read_number_from_file("info/counts/fasta/{}_N50.txt".format(thiss))
	pep_N50 = read_number_from_file("info/counts/peps/{}_N50.txt".format(thiss))
	qual_reject = read_number_from_file("info/trimming/txome/{}_qual_rejects.txt".format(thiss))
	adap_reject = read_number_from_file("info/trimming/txome/{}_adapter_rejects.txt".format(thiss))
	gc_reject = read_number_from_file("info/trimming/txome/{}_gc_rejects.txt".format(thiss))
	tx_meanlen = read_number_from_file("info/counts/fasta/{}_meanlen.txt".format(thiss))
	txome_N90 = read_number_from_file("info/counts/fasta/{}_N90.txt".format(thiss))
	txome_MB = read_number_from_file("info/counts/fasta/{}_mb.txt".format(thiss))
	print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
	thiss,
	int(trim_num)/int(raw_num),
	raw_num,
	trim_num,
	qual_reject,
	adap_reject,
	gc_reject,
	txome_count,
	tx_meanlen,
	txome_N50,
	txome_N90,
	txome_MB,
	pep_count,
	pep_N50
	), file = finalout_handle)
	finalout_handle.close()