afrendeiro/chipseq_pipeline.py

## chipseq_pipeline.py
#!/usr/bin/env python

"""
ChIP-seq pipeline
"""

from argparse import ArgumentParser
import os
import sys
import logging
import pandas as pd
import numpy as np
import time
import random
import re
import string
import textwrap
import shutil
import subprocess

# TODO: solve pandas chained assignments
pd.options.mode.chained_assignment = None

__author__ = "Andre Rendeiro"
__copyright__ = "Copyright 2014, Andre Rendeiro"
__credits__ = []
__license__ = "GPL2"
__version__ = "0.1"
__maintainer__ = "Andre Rendeiro"
__email__ = "arendeiro@cemm.oeaw.ac.at"
__status__ = "Development"


def main():
    """
    Command-line entry point of the ChIP-seq pipeline.

    Parses the global options and one sub-command ("preprocess", "stats",
    "analyse" or "compare"), sets up logging to "<project_name>.log" in the
    current working directory and to stdout, runs the function implementing
    the sub-command and copies the log file to the destination that function
    returned.

    Exits with status 0 once the sub-command has returned; the sub-commands
    themselves call sys.exit(1) on errors.
    """
    # Parse command-line arguments
    parser = ArgumentParser(description="ChIP-seq pipeline.")

    # Global options
    # positional arguments
    # optional arguments
    parser.add_argument("-r", "--project-root", default="/fhgfs/groups/lab_bock/shared/projects/",
                        dest="project_root", type=str,
                        help="""Directory in which the project will reside.
                        Default=/fhgfs/groups/lab_bock/shared/projects/.""")
    parser.add_argument("--html-root", default="/fhgfs/groups/lab_bock/public_html/arendeiro/",
                        dest="html_root", type=str,
                        help="""public_html directory in which bigwig files for the project will reside.
                        Default=/fhgfs/groups/lab_bock/public_html/arendeiro/.""")
    parser.add_argument("--url-root", default="http://www.biomedical-sequencing.at/bocklab/arendeiro/",
                        dest="url_root", type=str,
                        help="""Url mapping to public_html directory where bigwig files for the project will be accessed.
                        Default=http://www.biomedical-sequencing.at/bocklab/arendeiro/.""")
    parser.add_argument("--keep-tmp-files", dest="keep_tmp", action="store_true",
                        help="Keep intermediary files. If not it will only preserve final files. Default=False.")
    parser.add_argument("-c", "--cpus", default=4, dest="cpus",
                        help="Number of CPUs to use. Default=4.", type=int)
    parser.add_argument("-m", "--mem-per-cpu", default=4000, dest="mem",
                        help="Memory per CPU to use. Default=4000.", type=int)
    parser.add_argument("-q", "--queue", default="shortq", dest="queue",
                        choices=["develop", "shortq", "mediumq", "longq"],
                        help="Queue to submit jobs to. Default=shortq", type=str)
    parser.add_argument("-t", "--time", default="10:00:00", dest="time",
                        help="Maximum time for jobs to run. Default=10:00:00", type=str)
    parser.add_argument("--user-mail", default="", dest="user_mail",
                        help="User mail address. Default=<submitting user>.", type=str)
    parser.add_argument("-l", "--log-level", default="INFO", dest="log_level",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                        help="Logging level. Default=INFO.", type=str)
    parser.add_argument("--dry-run", dest="dry_run", action="store_true",
                        help="Dry run. Assemble commands, but do not submit jobs to slurm. Default=False.")
    parser.add_argument("--no-checks", dest="checks", action="store_false",
                        help="Don't check file existence and integrity. Default=False.")

    # Sub commands
    subparser = parser.add_subparsers(title="sub-command", dest="command")

    # preprocess
    preprocess_subparser = subparser.add_parser("preprocess")
    preprocess_subparser.add_argument(dest="project_name", help="Project name.", type=str)
    preprocess_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str)
    preprocess_subparser.add_argument("-s", "--stage", default="all", dest="stage",
                                      choices=["all", "bam2fastq", "fastqc", "trim", "mapping",
                                               "shiftreads", "markduplicates", "removeduplicates",
                                               "indexbam", "maketracks", "coverage"],
                                      help="Run only these stages. Default=all.", type=str)
    preprocess_subparser.add_argument("--trimmer", default="skewer", choices=["trimmomatic", "skewer"],
                                      dest="trimmer", help="Trimmer to use. Default=skewer.", type=str)
    preprocess_subparser.add_argument("-i", "--max-insert-size", default=2000,
                                      dest="maxinsert",
                                      help="Maximum allowed insert size allowed for paired end mates. Default=2000.",
                                      type=int)
    preprocess_subparser.add_argument("--window-size", default=1000, dest="windowsize",
                                      help="Window size used for genome-wide correlations. Default=1000.",
                                      type=int)

    # stats
    stats_subparser = subparser.add_parser("stats")
    stats_subparser.add_argument(dest="project_name", help="Project name.", type=str)
    stats_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str)

    # analyse
    analyse_subparser = subparser.add_parser("analyse")
    analyse_subparser.add_argument(dest="project_name", help="Project name.", type=str)
    analyse_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str)
    analyse_subparser.add_argument("-s", "--stage", default="all", dest="stage",
                                   choices=["all", "qc", "callpeaks", "findmotifs", "centerpeaks",
                                            "annotatepeaks", "peakanalysis", "tssanalysis", "footprints", "frip"],
                                   help="Run only these stages. Default=all.", type=str)
    analyse_subparser.add_argument("--peak-caller", default="macs2", choices=["macs2", "spp"],
                                   dest="peak_caller", help="Peak caller to use. Default=macs2.", type=str)
    analyse_subparser.add_argument("--peak-window-width", default=2000,
                                   dest="peak_window_width",
                                   help="Width of window around peak motifs. Default=2000.",
                                   type=int)

    # compare
    compare_subparser = subparser.add_parser("compare")
    compare_subparser.add_argument(dest="project_name", help="Project name.", type=str)
    compare_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str)
    compare_subparser.add_argument("-s", "--stage", default="all", dest="stage",
                                   choices=["all", "diffbind", "correlations"],
                                   help="Run only these stages. Default=all.", type=str)
    compare_subparser.add_argument("--duplicates", dest="duplicates", action="store_true",
                                   help="Allow duplicates in coorelation analysis. Default=False.")
    compare_subparser.add_argument("--genome-window-width", default=1000,
                                   dest="genome_window_width",
                                   help="Width of window to make genome-wide correlations. Default=1000.",
                                   type=int)

    # Parse
    args = parser.parse_args()

    # Everything below needs args.project_name, which only the sub-parsers define.
    # Some argparse versions treat the sub-command as optional, so reject its
    # absence with a usage error instead of failing later with an AttributeError.
    if getattr(args, "command", None) is None:
        parser.error("a sub-command is required")

    # Logging
    logger = logging.getLogger(__name__)
    # --log-level is restricted by `choices` to names of levels in the logging module
    logger.setLevel(getattr(logging, args.log_level))

    # both handlers share the same record format
    formatter = logging.Formatter(fmt='%(levelname)s: %(asctime)s - %(message)s', datefmt='%Y/%m/%d %H:%M:%S')

    # create a file handler
    # (for now in current working dir, in the end copy log to projectDir)
    handler = logging.FileHandler(os.path.join(os.getcwd(), args.project_name + ".log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # create a stdout handler (errors only)
    stdout = logging.StreamHandler(sys.stdout)
    stdout.setLevel(logging.ERROR)
    stdout.setFormatter(formatter)
    logger.addHandler(stdout)

    # Start main function.
    # Each sub-command returns the logger plus the source and destination of the log file.
    if args.command == "preprocess":
        logger, fr, to = preprocess(args, logger)
    elif args.command == "stats":
        logger, fr, to = readStats(args, logger)
    elif args.command == "analyse":
        logger, fr, to = analyse(args, logger)
    elif args.command == "compare":
        logger, fr, to = compare(args, logger)

    # Exit
    logger.info("Finished and exiting.")

    # Copy log to projectDir
    shutil.copy2(fr, to)

    # Reaching this point means the sub-command completed: report success to the shell.
    sys.exit(0)


def checkProjectDirs(args, logger):
    """
    Checks the project and public_html roots are writable and creates the project tree.

    args - parsed command-line arguments. Reads `project_root`, `html_root`, `url_root`
    and `project_name`; `args.project_root` is replaced by its absolute path.
    logger - logger used for the debug and error messages.

    Exits with status 1 if either root does not exist or is not writable.

    Returns a tuple (htmlDir, projectDir, dataDir, resultsDir, urlRoot) where htmlDir is
    the project's "bigWig" directory under the html root and urlRoot is
    `url_root + project_name + "/bigWig/"` (so `url_root` is expected to end with "/").
    """
    # Directories and paths
    args.project_root = os.path.abspath(args.project_root)
    htmlRoot = os.path.abspath(args.html_root)

    # check both roots exist and user has write access
    roots = [
        (args.project_root, "Use option '-r' '--project-root' to set a non-default project root path."),
        (htmlRoot, "Use option '--html-root' to set a non-default html root path.")
    ]
    for root, hint in roots:
        logger.debug("Checking if %s directory exists and is writable." % root)
        if not os.access(root, os.W_OK):
            logger.error("%s does not exist, or user has no write access.\n%s" % (root, hint))
            sys.exit(1)

    # project directories
    projectDir = os.path.join(args.project_root, args.project_name)
    dataDir = os.path.join(projectDir, "data")
    resultsDir = os.path.join(projectDir, "results")
    plotsDir = os.path.join(resultsDir, "plots")

    # public_html directories of the project
    # (built from args.html_root as given, which is also what gets returned)
    htmlProjectDir = os.path.join(args.html_root, args.project_name)
    htmlBigWigDir = os.path.join(htmlProjectDir, "bigWig")
    html = [htmlRoot, htmlProjectDir, htmlBigWigDir]

    # make relative project dirs
    dirs = [projectDir, os.path.join(projectDir, "runs"), dataDir]
    dirs += [os.path.join(dataDir, d) for d in
             ["fastq", "fastqc", "raw", "mapped", "coverage", "peaks", "motifs"]]
    dirs += [resultsDir, plotsDir, os.path.join(plotsDir, "cdt"), os.path.join(plotsDir, "pickles")]
    dirs += html
    for d in dirs:
        if os.path.exists(d):
            continue
        try:
            os.makedirs(d)
        except OSError:
            # another process may have created it in the meantime; anything else is an error
            if not os.path.isdir(d):
                raise

    # chmod of paths to public_html folder (best effort: failures are only logged)
    for d in html:
        try:
            os.chmod(d, 0o755)
        except OSError:
            logger.error("cannot change folder's mode: %s" % d)

    urlRoot = args.url_root + args.project_name + "/bigWig/"

    return (htmlBigWigDir, projectDir, dataDir, resultsDir, urlRoot)


def _mergeReplicateGroups(samples, groupVars, nameVars, zeroedVars, distinctVar=None):
    """
    Returns a list with one record (dict) per group of two or more samples sharing `groupVars`.

    samples - a pandas.DataFrame with sample info (needs "unmappedBam" and all listed variables).
    groupVars - columns defining which samples belong together.
    nameVars - columns copied from the first sample of the group and joined into "sampleName".
    zeroedVars - columns set to 0 in the merged record.
    distinctVar - if given, groups holding a single distinct value of this column are skipped.
    """
    records = []
    for labels in samples.groupby(groupVars).groups.values():
        if len(labels) < 2:
            continue
        group = samples.loc[labels]
        if distinctVar is not None and len(group[distinctVar].unique()) < 2:
            continue
        # the merged sample inherits its description from the first sample of the group
        record = group[nameVars].iloc[0].to_dict()
        record["experimentName"] = np.nan
        for var in zeroedVars:
            record[var] = 0
        record["unmappedBam"] = group["unmappedBam"].tolist()
        record["sampleName"] = "_".join([str(record[var]) for var in nameVars])
        records.append(record)
    return records


def getReplicates(samples):
    """
    Returns new sample annotation sheet with provided samples, plus biological replicates
    (merged technical replicates) and merged biological replicates.
    samples - a pandas.DataFrame with sample info. A "sampleName" column is written to it.

    In the returned sheet (sorted by "sampleName") every "unmappedBam" value is a list of
    files; merged samples have "technicalReplicate" (and for merged biological replicates
    also "biologicalReplicate") set to 0 and no "experimentName".

    Only `.loc`/`.iloc`, `pandas.concat` and `sort_values` are used, since `.ix`,
    `DataFrame.append` and `DataFrame.sort` no longer exist in current pandas.
    """
    # ignore some fields in the annotation sheet
    varsName = ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "biologicalReplicate", "technicalReplicate", "genome"]

    # get sample names
    samples["sampleName"] = ["_".join([str(row[var]) for var in varsName]) for _, row in samples.iterrows()]

    # get merged technical replicates -> biological replicates
    merged = _mergeReplicateGroups(
        samples,
        ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "biologicalReplicate", "genome"],
        varsName,
        ["technicalReplicate"]
    )

    # get merged biological replicates -> merged biological replicates
    # (only for groups whose samples come from different biological replicates)
    merged += _mergeReplicateGroups(
        samples,
        ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "genome"],
        varsName,
        ["biologicalReplicate", "technicalReplicate"],
        distinctVar="biologicalReplicate"
    )

    # append merged samples to sample annotation
    samplesMerged = samples.copy()
    if merged:
        extra = pd.DataFrame(merged)
        # keep the sheet's column order whatever order concat chooses
        columns = list(samplesMerged.columns) + [c for c in extra.columns if c not in samplesMerged.columns]
        samplesMerged = pd.concat([samplesMerged, extra], ignore_index=True).reindex(columns=columns)

    # replace file name with list of file names for only one sample (original case)
    samplesMerged["unmappedBam"] = pd.Series(
        [bams if isinstance(bams, list) else [bams] for bams in samplesMerged["unmappedBam"]],
        index=samplesMerged.index
    )

    return samplesMerged.sort_values(["sampleName"])


def preprocess(args, logger):
    """
    This takes unmapped Bam files and makes trimmed, aligned, indexed (and shifted if necessary)
    Bam files along with a UCSC browser track.

    For every sample of the annotation sheet (plus the merged replicates added by
    getReplicates) a slurm job script is assembled from the stages selected with
    `args.stage`, written to "<projectDir>/runs/<jobName>.sh" and, unless `args.dry_run`
    is set, submitted. The original sheet and the sheet with replicates are then written
    to the project directory.

    args - parsed command-line arguments of the "preprocess" sub-command.
    logger - logger used for progress and error messages.

    Exits with status 1 if the annotation sheet is missing, a sample's genome has no
    index (mapping stage) or a job submission fails.

    Returns a tuple (logger, path of the log file in the working directory,
    path the log file should be copied to).
    """

    logger.info("Starting sample preprocessing.")

    logger.debug("Checking project directories exist and creating if not.")
    htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger)

    # Paths to static files on the cluster
    genomeFolder = "/fhgfs/prod/ngs_resources/genomes/"
    genomeIndexes = {
        "hg19": "/home/arendeiro/reference/Homo_sapiens/forBowtie2/hg19",
        "mm10": os.path.join(genomeFolder, "mm10/forBowtie2/mm10/index_woRandom/mm10"),
        "dr7": os.path.join(genomeFolder, "dr7/forBowtie2/dr7")
    }
    genomeSizes = {
        "hg19": "/fhgfs/groups/lab_bock/arendeiro/share/hg19.chrom.sizes",
        "mm10": "/fhgfs/groups/lab_bock/arendeiro/share/mm10.chrom.sizes",
        "dr7": "/fhgfs/groups/lab_bock/arendeiro/share/danRer7.chrom.sizes"
    }
    # NOTE(review): not used below, the coverage stage hard-codes a hg19 windows file.
    genomeWindows = {
        "hg19": "/fhgfs/groups/lab_bock/arendeiro/share/hg19.genomeWindows_1kb.bed",
        "mm10": "/fhgfs/groups/lab_bock/arendeiro/share/mm10.genomeWindows_1kb.bed",
        "dr7": "/fhgfs/groups/lab_bock/arendeiro/share/danRer7.genomeWindows_1kb.bed"
    }
    adapterFasta = "/fhgfs/groups/lab_bock/shared/cm.fa"

    # Other static info
    # NOTE(review): not used below, `tagmented` is hard-coded for every sample.
    tagment = [
        "DNASE", "DNASESEQ", "DNASE-SEQ", "DHS", "DHS-SEQ", "DHSSEQ",
        "ATAC", "ATAC-SEQ", "ATACSEQ",
        "CM"
    ]

    # from the ggplot2 color blind pallete
    # #999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
    # shortly: green,red=active, orange=transcription, blue=repression, yellow=potential
    # Keys must be upper case: the ip is upper-cased before being looked up.
    colours = {
        "IGG": "153,153,153", "INPUT": "153,153,153",  # grey
        "H3K36ME1": "230,159,0", "H3K36ME2": "230,159,0", "H3K36ME3": "230,159,0",  # orange
        "H3K4ME3": "0,158,115",  # bluish green
        "H3K4ME1": "120,114,33", "H3K14AC": "120,114,33",  # yellow
        "H3K27ME1": "0,114,178", "H3K27ME2": "0,114,178", "H3K27ME3": "0,114,178",  # blue
        "H3K9ME1": "86,180,233", "H3K9ME2": "86,180,233", "H3K9ME3": "86,180,233",  # sky blue
        "H3AC": "213,94,0", "H3K9AC": "213,94,0", "H3K27AC": "213,94,0", "H3K56AC": "213,94,0",  # vermillion
        "H3K79ME1": "204,121,167", "H3K79ME2": "204,121,167", "H3K79ME3": "204,121,167",  # reddish purple
        "ATAC": "0,158,115",
        "DNASE": "0,158,115",
    }

    colour_gradient = [  # 10 colour gradient from red to blue
        "155,3,5", "140,2,18", "125,2,31", "110,2,44", "96,2,57",
        "81,2,70", "66,2,83", "52,2,96", "37,2,109", "22,2,122"
    ]

    # Parse sample information
    args.csv = os.path.abspath(args.csv)

    # check if exists and is a file
    if not os.path.isfile(args.csv):
        logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv)
        sys.exit(1)

    # read in
    samples = pd.read_csv(args.csv)

    # TODO:
    # Perform checks on the variables given
    # (e.g. genome in genomeIndexes)

    # start pipeline
    projectName = "_".join([args.project_name, time.strftime("%Y%m%d-%H%M%S")])

    # Get biological replicates and merged biological replicates
    logger.debug("Checking which samples should be merged.")
    samplesMerged = getReplicates(samples.copy())
    # ^^ this is the new annotation sheet as well
    samplesMerged['readType'] = None
    samplesMerged['readLength'] = None

    # add field for manual sample pairing
    samplesMerged["controlSampleName"] = None

    fastqDir = os.path.join(dataDir, "fastq")
    mappedDir = os.path.join(dataDir, "mapped")

    # Preprocess samples
    for sample in samplesMerged.index:
        # Get sample name and frequently used fields
        sampleName = samplesMerged.loc[sample, "sampleName"]
        genome = samplesMerged.loc[sample, "genome"]
        # list of unmapped bam files of the sample (see getReplicates)
        unmappedBams = samplesMerged.loc[sample, "unmappedBam"]

        # get jobname
        jobName = projectName + "_" + sampleName

        # check if sample is paired end
        # NOTE(review): hard-coded, every sample is treated as single-end.
        PE = False

        # check if sample is tagmented or not:
        # NOTE(review): hard-coded, every sample gets its reads shifted.
        tagmented = True

        # get intermediate names for files
        # NOTE(review): `unmappedBam` is not used below; both trimmers receive the
        # list `unmappedBams`, and the "bam2fastq"/"fastqc" stages accepted on the
        # command line are not assembled here.
        if len(unmappedBams) == 1:
            unmappedBam = unmappedBams[0]
        else:
            unmappedBam = os.path.join(dataDir, "raw", sampleName + ".bam")

        rawFastq1 = os.path.join(fastqDir, sampleName + ".1.fastq")
        rawFastq2 = os.path.join(fastqDir, sampleName + ".2.fastq")
        trimmedFastq = os.path.join(fastqDir, sampleName + ".trimmed.fastq")
        trimmedFastq1 = os.path.join(fastqDir, sampleName + ".1.trimmed.fastq")
        trimmedFastq2 = os.path.join(fastqDir, sampleName + ".2.trimmed.fastq")
        unpairedFastq1 = os.path.join(fastqDir, sampleName + ".1_unpaired.trimmed.fastq")
        unpairedFastq2 = os.path.join(fastqDir, sampleName + ".2_unpaired.trimmed.fastq")
        trimLog = os.path.join(resultsDir, sampleName + ".trimlog.txt")
        mappedBam = os.path.join(mappedDir, sampleName + ".trimmed.bowtie2.bam")

        # prefix (without ".bam") of the bam file used from duplicate marking onwards
        if not tagmented:
            bam = os.path.join(mappedDir, sampleName + ".trimmed.bowtie2")
        else:
            bam = os.path.join(mappedDir, sampleName + ".trimmed.bowtie2.shifted")

        # get colour for tracks: by ip, then by technique, else a random one
        ip = str(samplesMerged.loc[sample, "ip"]).upper()
        technique = str(samplesMerged.loc[sample, "technique"]).upper()
        if ip in colours:
            colour = colours[ip]
        elif technique in ["ATAC", "ATACSEQ", "ATAC-SEQ"]:
            colour = colours["ATAC"]
        elif technique in ["DNASE", "DNASESEQ", "DNASE-SEQ"]:
            colour = colours["DNASE"]
        else:
            colour = random.sample(colour_gradient, 1)[0]  # pick one randomly

        # keep track of temporary files
        tempFiles = list()

        # assemble commands
        # get job header
        jobCode = slurmHeader(
            jobName=jobName,
            output=os.path.join(projectDir, "runs", jobName + ".slurm.log"),
            queue=args.queue,
            time=args.time,
            cpusPerTask=args.cpus,
            memPerCpu=args.mem,
            userMail=args.user_mail
        )

        if args.stage in ["all", "trim"]:
            # TODO:
            # Change absolute path to something usable by everyone or to an option.
            if args.trimmer == "trimmomatic":
                jobCode += trimmomatic(
                    inputFastq1=unmappedBams,
                    inputFastq2=rawFastq2 if PE else None,
                    outputFastq1=trimmedFastq1 if PE else trimmedFastq,
                    outputFastq1unpaired=unpairedFastq1 if PE else None,
                    outputFastq2=trimmedFastq2 if PE else None,
                    outputFastq2unpaired=unpairedFastq2 if PE else None,
                    cpus=args.cpus,
                    adapters=adapterFasta,
                    log=trimLog
                )
                if PE:
                    tempFiles.extend([trimmedFastq1, trimmedFastq2, unpairedFastq1, unpairedFastq2])
                else:
                    tempFiles.append(trimmedFastq)

            elif args.trimmer == "skewer":
                skewerPrefix = os.path.join(fastqDir, sampleName)
                jobCode += skewer(
                    inputFastq1=rawFastq1 if PE else unmappedBams,
                    inputFastq2=rawFastq2 if PE else None,
                    outputPrefix=skewerPrefix,
                    cpus=args.cpus,
                    adapters=adapterFasta
                )
                # move files to have common name
                if PE:
                    jobCode += moveFile(skewerPrefix + "-trimmed-pair1.fastq", trimmedFastq1)
                    jobCode += moveFile(skewerPrefix + "-trimmed-pair2.fastq", trimmedFastq2)
                    tempFiles.extend([trimmedFastq1, trimmedFastq2])
                else:
                    jobCode += moveFile(skewerPrefix + "-trimmed.fastq", trimmedFastq)
                    tempFiles.append(trimmedFastq)
                # move log to results dir
                jobCode += moveFile(skewerPrefix + "-trimmed.log", trimLog)

        if args.stage in ["all", "mapping"]:
            if genome not in genomeIndexes:
                logger.error("Sample %s has unsuported genome index: %s" % (sampleName, genome))
                sys.exit(1)

            jobCode += bowtie2Map(
                inputFastq1=trimmedFastq1 if PE else trimmedFastq,
                inputFastq2=trimmedFastq2 if PE else None,
                outputBam=mappedBam,
                log=os.path.join(resultsDir, sampleName + ".alnRates.txt"),
                metrics=os.path.join(resultsDir, sampleName + ".alnMetrics.txt"),
                genomeIndex=genomeIndexes[genome],
                maxInsert=args.maxinsert,
                cpus=args.cpus
            )
            tempFiles.append(mappedBam)
        if args.stage in ["all", "shiftreads"]:
            if tagmented:
                jobCode += shiftReads(
                    inputBam=mappedBam,
                    genome=genome,
                    outputBam=bam + ".bam"
                )
        if args.stage in ["all", "markduplicates"]:
            jobCode += markDuplicates(
                inputBam=bam + ".bam",
                outputBam=bam + ".dups.bam",
                metricsFile=os.path.join(resultsDir, sampleName + ".duplicates.txt")  #
                # tempDir=
            )
        if args.stage in ["all", "removeduplicates"]:
            jobCode += removeDuplicates(
                inputBam=bam + ".dups.bam",
                outputBam=bam + ".nodups.bam",
                cpus=args.cpus
            )
        if args.stage in ["all", "indexbam"]:
            jobCode += indexBam(
                inputBam=bam + ".dups.bam"
            )
            jobCode += indexBam(
                inputBam=bam + ".nodups.bam"
            )
        if args.stage in ["all", "maketracks"]:
            trackHub = os.path.join(htmlDir, "trackHub_{0}.txt".format(genome))
            # right now tracks are only made for bams with duplicates
            jobCode += bamToUCSC(
                inputBam=bam + ".dups.bam",
                outputBigWig=os.path.join(htmlDir, sampleName + ".bigWig"),
                genomeSizes=genomeSizes[genome],
                genome=genome,
                tagmented=False
            )
            jobCode += addTrackToHub(
                sampleName=sampleName,
                trackURL=urlRoot + sampleName + ".bigWig",
                trackHub=trackHub,
                colour=colour
            )
            if tagmented:
                jobCode += bamToUCSC(
                    inputBam=bam + ".dups.bam",
                    outputBigWig=os.path.join(htmlDir, sampleName + ".5prime.bigWig"),
                    genomeSizes=genomeSizes[genome],
                    genome=genome,
                    tagmented=True
                )
                jobCode += addTrackToHub(
                    sampleName=sampleName,
                    trackURL=urlRoot + sampleName + ".5prime.bigWig",
                    trackHub=trackHub,
                    fivePrime="5prime",
                    colour=colour
                )
            # called while assembling (not added to the job code);
            # the url root is stripped of trailing slashes so the URL has no "//" in its path
            linkToTrackHub(
                trackHubURL="{0}/{1}/bigWig/trackHub_{2}.txt".format(args.url_root.rstrip("/"), args.project_name, genome),
                fileName=os.path.join(projectDir, "ucsc_tracks_{0}.html".format(genome)),
                genome=genome
            )
        if args.stage in ["all", "coverage"]:
            # NOTE(review): the hg19 windows file is used for every sample,
            # whatever its genome - confirm this is intended.
            jobCode += genomeWideCoverage(
                inputBam=bam + ".nodups.bam",
                genomeWindows="/fhgfs/groups/lab_bock/arendeiro/share/hg19.genomeWindows_1kb.filteredMappability50mer.filteredBlacklisted.bed",
                output=os.path.join(dataDir, "coverage", sampleName + ".cov")
            )

        # Remove intermediary files
        if args.stage == "all" and not args.keep_tmp:
            logger.debug("Removing intermediary files")
            for fileName in tempFiles:
                jobCode += removeFile(fileName)

        # Submit job to slurm
        # Get concatenated string with code from all modules
        jobCode += slurmFooter()

        # Output file name
        jobFile = os.path.join(projectDir, "runs", jobName + ".sh")

        with open(jobFile, 'w') as handle:
            handle.write(textwrap.dedent(jobCode))

        # Submit to slurm
        if not args.dry_run:
            logger.info("Submitting job to slurm: %s" % jobName)
            status = slurmSubmitJob(jobFile)

            if status != 0:
                logger.error("Slurm job '%s' not successfull" % jobFile)
                sys.exit(1)
            logger.debug("Project '%s'submission finished successfully." % args.project_name)

        #  Write location of bam file in merged samples annotation sheet
        #  (the "unmappedBam" column is reused to hold the resulting mapped bam)
        samplesMerged.loc[sample, 'unmappedBam'] = bam + ".dups.bam"

    # write original annotation sheet to project folder
    samples.to_csv(os.path.join(projectDir, args.project_name + ".annotation_sheet.csv"), index=False)
    # write annotation sheet with biological replicates to project folder
    samplesMerged.to_csv(
        os.path.join(projectDir, args.project_name + ".replicates.annotation_sheet.csv"),
        index=False
    )

    logger.debug("Finished preprocessing")

    return (logger,
            os.path.join(os.getcwd(), args.project_name + ".log"),
            os.path.join(projectDir, "runs", args.project_name + ".log"))


def _countLines(fileName):
    """
    Returns the number of lines in `fileName` (a last line without newline is counted too).
    """
    with open(fileName, "rb") as handle:
        return sum(1 for _ in handle)


def readStats(args, logger):
    """
    Given an annotation sheet with replicates, gets number of reads mapped, duplicates, etc...

    For every sample the alignment rates, the duplicate metrics, the QC values
    (NSC, RSC, quality tag) and, if the sheet has a "peakFile" column, the number
    of peaks and the FRiP are collected from the project's results directory.
    Missing or unreadable records only produce a warning. The completed sheet is
    written to "<projectDir>/<project_name>.read_stats.csv".

    args - parsed command-line arguments of the "stats" sub-command.
    logger - logger used for progress and warning messages.

    Exits with status 1 if the annotation sheet does not exist.

    Returns a tuple (logger, path of the log file in the working directory,
    path the log file should be copied to).
    """

    logger.info("Starting sample read stats.")

    logger.debug("Checking project directories exist and creating if not.")
    htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger)

    # Parse sample information
    args.csv = os.path.abspath(args.csv)

    # check if exists and is a file
    if not os.path.isfile(args.csv):
        logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv)
        sys.exit(1)

    # read in
    samples = pd.read_csv(args.csv)

    variables = ["cellLine", "numberCells", "technique", "ip", "patient",
                 "treatment", "biologicalReplicate", "technicalReplicate", "genome"]

    # columns for alignment stats
    alnCols = ["readCount", "unpaired", "unaligned", "unique", "multiple", "alignmentRate"]
    # columns for duplicate stats, in the order of the metrics file columns (after "LIBRARY")
    dupCols = ["unpairedReadsExamined", "readPairsExamined", "unmappedReads", "unpairedReadDuplicates",
               "readPairDuplicates", "readPairOpticalDuplicates", "percentDuplication", "estimatedLibrarySize"]
    for col in alnCols + ["totalReads"] + dupCols:
        samples[col] = None

    hasPeaks = "peakFile" in samples.columns
    if hasPeaks:
        for col in ["NSC", "RSC", "peakNumber", "FRiP"]:
            samples[col] = None

    for sample in samples.index:
        # Sample name
        # if sampleName is not provided, use a concatenation of several variable (excluding longest)
        if str(samples.loc[sample, "sampleName"]) != "nan":
            sampleName = samples.loc[sample, "sampleName"]
        else:
            sampleName = "_".join([str(samples.loc[sample, var]) for var in variables])
            logger.debug("No sample name provided, using concatenation of variables supplied")

        # Get alignment stats
        try:
            # get stats into a series
            alnStats = parseBowtieStats(os.path.join(resultsDir, sampleName + ".alnRates.txt"))
            # add to dataFrame
            samples.loc[sample, alnStats.index.tolist()] = alnStats
        except Exception:
            # best effort: a sample without this record is reported and left empty
            logger.warning("Record with alignment rates is empty or not found for sample %s" % sampleName)

        # Get duplicate stats
        try:
            dups = pd.read_csv(
                os.path.join(resultsDir, sampleName + ".duplicates.txt"),
                sep="\t",
                comment="#",
                header=0
            )
            # drop if empty
            dups.dropna(thresh=len(dups.columns) - 1, inplace=True)

            # Add values to sample sheet (matched by position, not by name)
            metrics = dups.drop(["LIBRARY"], axis=1).loc[0]
            for position, col in enumerate(dupCols):
                samples.loc[sample, col] = metrics.iloc[position]
        except Exception:
            logger.warning("Record with duplicates is empty or not found for sample %s" % sampleName)

        # Get NSC and RSC
        qcFile = os.path.join(resultsDir, "sample_QC.tsv")
        qc = parseQC(sampleName, qcFile)
        samples.loc[sample, "NSC"] = qc["NSC"]
        samples.loc[sample, "RSC"] = qc["RSC"]
        samples.loc[sample, "qualityTag"] = qc["qualityTag"]

        # Peak-based stats (if peaks in sheet and sample has peaks)
        if hasPeaks and str(samples.loc[sample, "peakFile"]) != "nan":
            peakFile = samples.loc[sample, "peakFile"]

            # Count peak number
            try:
                samples.loc[sample, "peakNumber"] = _countLines(peakFile)
            except (IOError, OSError):
                logger.warning("Peak file '%s' could not be read for sample %s" % (peakFile, sampleName))

            # Get FRiP from file (if exists) and add to sheet
            try:
                with open(os.path.join(resultsDir, sampleName + "_FRiP.txt"), "r") as handle:
                    content = handle.readlines()

                readsInPeaks = int(re.sub(r"\D", "", content[0]))
                mappedReads = samples.loc[sample, "readCount"] - samples.loc[sample, "unaligned"]

                # true division: with two integers Python 2 would floor the fraction
                samples.loc[sample, "FRiP"] = float(readsInPeaks) / mappedReads

            except Exception:
                logger.warning("Record with FRiP value is empty or not found for sample %s" % sampleName)

    # convert duplicate rate to percentage
    samples.loc[:, 'percentDuplication'] = samples['percentDuplication'] * 100

    # write annotation sheet with biological replicates
    samples.to_csv(os.path.join(projectDir, args.project_name + ".read_stats.csv"), index=False)

    logger.debug("Finished getting read statistics.")

    return (logger,
            os.path.join(os.getcwd(), args.project_name + ".log"),
            os.path.join(projectDir, "runs", args.project_name + ".log"))


def analyse(args, logger):
    """
    Builds one slurm job per sample that has a matched control, and submits the jobs.

    Depending on `args.stage`, the job of each sample receives commands for QC,
    peak calling, motif finding, peak centering, peak annotation, peak and TSS
    analysis and FRiP calculation. Samples for which the annotation sheet does not
    hold exactly one control record with a string `unmappedBam` are skipped.

    The annotation sheet, extended with the columns `controlSampleunmappedBam` and
    `peakFile`, is written to the project directory.

    Attributes read from `args`: csv, project_name, stage, peak_caller,
    peak_window_width, queue, time, cpus, mem, user_mail and dry_run.

    Returns tuple of (logger, log path in the current directory,
    log path in the project "runs" directory).
    Exits the program if the annotation sheet is missing or a job submission fails.
    """
    logger.info("Starting sample analysis.")

    logger.debug("Checking project directories exist and creating if not.")
    htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger)

    # Other static info
    tagment = [
        "DNASE", "DNASESEQ", "DNASE-SEQ", "DHS", "DHS-SEQ", "DHSSEQ",
        "ATAC", "ATAC-SEQ", "ATACSEQ",
        "CM"
    ]
    histones = ["H2A", "H2B", "H3", "H4"]
    broadFactors = [
        "H3K27ME1", "H3K27ME2", "H3K27ME3",
        "H3K36ME1", "H3K36ME2", "H3K36ME3",
        "H3K9ME1", "H3K9ME2", "H3K9ME3",
        "H3K72ME1", "H3K72ME2", "H3K72ME3"
    ]

    # Paths to static files on the cluster
    # NOTE(review): "dr7" maps to the hg19 TSS file - looks like a placeholder, confirm.
    tssFiles = {
        "hg19": "/fhgfs/groups/lab_bock/arendeiro/share/GRCh37_hg19_refSeq.tss.bed",
        "mm10": "/fhgfs/groups/lab_bock/arendeiro/share/GRCm38_mm10_refSeq.tss.bed",
        "dr7": "/fhgfs/groups/lab_bock/arendeiro/share/GRCh37_hg19_refSeq.tss.bed"
    }

    # Parse sample information
    args.csv = os.path.abspath(args.csv)

    # check if exists and is a file
    if not os.path.isfile(args.csv):
        logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv)
        sys.exit(1)

    # read in
    samples = pd.read_csv(args.csv)
    samples["controlSampleunmappedBam"] = None
    samples["peakFile"] = None

    # TODO:
    # Perform checks on the variables given
    # (e.g. columns existing, bams existing)

    # start pipeline
    projectName = "_".join([args.project_name, time.strftime("%Y%m%d-%H%M%S")])

    # Variables used to build a sample name when none is given
    variables = ["cellLine", "numberCells", "technique", "ip", "patient",
                 "treatment", "biologicalReplicate", "technicalReplicate", "genome"]

    # track jobs to submit
    jobs = dict()

    # read_csv gives a default integer index, so row position and row label coincide
    for sample in range(len(samples)):
        # Sample name
        # if sampleName is not provided, use a concatenation of several variables
        if str(samples.loc[sample, "sampleName"]) != "nan":
            sampleName = samples.loc[sample, "sampleName"]
        else:
            sampleName = "_".join([str(samples.loc[sample, var]) for var in variables])
            logger.debug("No sample name provided, using concatenation of variables supplied")

        # Pair with control:
        # get file path of the sample which this sample's controlSampleName is pointing to
        # (empty if no control sample is indicated)
        isControl = samples["sampleName"].isin([samples.loc[sample, "controlSampleName"]])
        ctrlField = samples.loc[isControl, "unmappedBam"]

        # only samples with exactly one matched control record are processed
        if not (len(ctrlField) == 1 and isinstance(ctrlField.values[0], str)):
            continue

        controlName = samples.loc[sample, "controlSampleName"]
        controlBam = os.path.abspath(ctrlField.values[0])
        samples.loc[sample, "controlSampleunmappedBam"] = controlBam

        # check if sample is tagmented or not:
        tagmented = samples.loc[sample, "technique"] in tagment

        if not tagmented:
            nodupsbam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.nodups.bam")
        else:
            nodupsbam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.shifted.nodups.bam")

        # Is it a histone?
        histone = any(i in samples.loc[sample, "ip"] for i in histones)

        # Is it a broad factor?
        broad = samples.loc[sample, "ip"].upper() in broadFactors

        # Output locations of this sample
        peaksDir = os.path.join(dataDir, "peaks", sampleName)
        motifsDir = os.path.join(dataDir, "motifs", sampleName)

        # peakFile
        peakFile = os.path.join(
            peaksDir,
            sampleName + "_peaks" + (".narrowPeak" if not broad else ".broadPeak")
        )
        samples.loc[sample, "peakFile"] = peakFile

        jobName = projectName + "_" + sampleName

        # assemble commands
        jobCode = slurmHeader(
            jobName=jobName,
            output=os.path.join(projectDir, "runs", jobName + ".slurm.log"),
            queue=args.queue,
            time=args.time,
            cpusPerTask=args.cpus,
            memPerCpu=args.mem,
            userMail=args.user_mail
        )
        if args.stage in ["all", "qc"]:
            jobCode += peakTools(
                inputBam=nodupsbam,
                output=os.path.join(resultsDir, "sample_QC.tsv"),
                plot=os.path.join(resultsDir, sampleName + "_QC.pdf"),
                cpus=args.cpus
            )
        if args.stage in ["all", "callpeaks"]:
            if args.peak_caller == "macs2":
                # make dir for output
                if not os.path.exists(peaksDir):
                    os.makedirs(peaksDir)

                # For point-source factors use default settings
                # For broad factors use broad settings
                jobCode += macs2CallPeaks(
                    treatmentBam=os.path.abspath(samples.loc[sample, 'unmappedBam']),
                    controlBam=controlBam,
                    outputDir=peaksDir,
                    sampleName=sampleName,
                    genome=samples.loc[sample, 'genome'],
                    broad=broad
                )
            elif args.peak_caller == "spp":
                # For point-source factors use default settings
                # For broad factors use broad settings
                jobCode += sppCallPeaks(
                    treatmentBam=os.path.abspath(samples.loc[sample, 'unmappedBam']),
                    controlBam=controlBam,
                    treatmentName=sampleName,
                    controlName=controlName,
                    outputDir=peaksDir,
                    broad=broad,
                    cpus=args.cpus
                )

        if args.stage in ["all", "findmotifs"]:
            # make dir for motifs
            if not os.path.exists(motifsDir):
                os.makedirs(motifsDir)

            if not histone:
                # For TFs, find the "self" motif
                jobCode += homerFindMotifs(
                    peakFile=peakFile,
                    genome=samples.loc[sample, "genome"],
                    outputDir=motifsDir,
                    size="50",
                    length="8,10,12,14,16",
                    n_motifs=8
                )
                # For TFs, find co-binding motifs (broader region)
                jobCode += homerFindMotifs(
                    peakFile=peakFile,
                    genome=samples.loc[sample, "genome"],
                    outputDir=os.path.join(dataDir, "motifs", sampleName + "_cobinders"),
                    size="200",
                    length="8,10,12,14,16",
                    n_motifs=12
                )
            else:
                # For histones, use a broader region to find motifs
                jobCode += homerFindMotifs(
                    peakFile=peakFile,
                    genome=samples.loc[sample, "genome"],
                    outputDir=motifsDir,
                    size="1000",
                    length="8,10,12,14,16",
                    n_motifs=20
                )

        if args.stage in ["all", "centerpeaks"]:
            # TODO:
            # right now this assumes peaks were called with MACS2
            # figure out a way of getting the peak files without using the peak_caller option,
            # for that would imply that option would be required when selecting this stage
            jobCode += centerPeaksOnMotifs(
                peakFile=peakFile,
                genome=samples.loc[sample, "genome"],
                windowWidth=args.peak_window_width,
                motifFile=os.path.join(motifsDir, "homerResults", "motif1.motif"),
                outputBed=os.path.join(peaksDir, sampleName + "_peaks.motifCentered.bed")
            )

        if args.stage in ["all", "annotatepeaks"]:
            # TODO:
            # right now this assumes peaks were called with MACS2
            # figure out a way of getting the peak files without using the peak_caller option,
            # for that would imply that option would be required when selecting this stage
            jobCode += AnnotatePeaks(
                peakFile=peakFile,
                genome=samples.loc[sample, "genome"],
                motifFile=os.path.join(motifsDir, "homerResults", "motif1.motif"),
                outputBed=os.path.join(peaksDir, sampleName + "_peaks.motifAnnotated.bed")
            )

        if args.stage in ["all", "peakanalysis"]:
            jobCode += peakAnalysis(
                inputBam=os.path.abspath(samples.loc[sample, 'unmappedBam']),
                peakFile=os.path.join(peaksDir, sampleName + "_peaks.motifCentered.bed"),
                plotsDir=os.path.join(resultsDir, 'plots'),
                windowWidth=2000,
                fragmentsize=1 if tagmented else 50,  # change this to actual read length
                genome=samples.loc[sample, 'genome'],
                n_clusters=5,
                strand_specific=True,
                duplicates=True
            )

        if args.stage in ["all", "tssanalysis"]:
            jobCode += tssAnalysis(
                inputBam=os.path.abspath(samples.loc[sample, 'unmappedBam']),
                tssFile=tssFiles[samples.loc[sample, 'genome']],
                plotsDir=os.path.join(resultsDir, 'plots'),
                windowWidth=2000,
                fragmentsize=1 if tagmented else 50,  # change this to actual read length
                genome=samples.loc[sample, 'genome'],
                n_clusters=5,
                strand_specific=True,
                duplicates=True
            )
        if args.stage in ["all", "frip"]:
            jobCode += calculateFRiP(
                inputBam=os.path.abspath(samples.loc[sample, 'unmappedBam']),
                inputBed=peakFile,
                output=os.path.join(resultsDir, sampleName + "_FRiP.txt"),
            )

        # finish job
        jobCode += slurmFooter()
        jobs[jobName] = jobCode

    # Submit jobs to slurm
    for jobName, jobCode in jobs.items():
        # Output file name
        jobFile = os.path.join(projectDir, "runs", jobName + ".sh")

        with open(jobFile, 'w') as handle:
            handle.write(textwrap.dedent(jobCode))

        # Submit to slurm
        if not args.dry_run:
            logger.info("Submitting jobs to slurm")
            status = slurmSubmitJob(jobFile)

            if status != 0:
                logger.error("Slurm job '%s' not successfull" % jobFile)
                sys.exit(1)
            logger.debug("Project '%s'submission finished successfully." % args.project_name)

    # write annotation sheet (with control bam and peak file columns) to project folder
    samples.to_csv(
        os.path.join(projectDir, args.project_name + ".replicates.annotation_sheet.csv"),
        index=False
    )

    logger.debug("Finished analysis")

    return (logger,
            os.path.join(os.getcwd(), args.project_name + ".log"),
            os.path.join(projectDir, "runs", args.project_name + ".log"))


def compare(args, logger):
    """
    Builds and submits slurm jobs that compare samples with each other.

    With stage "diffbind" (or "all") one DiffBind job is created per genome and
    IPed factor, skipping the "INPUT" and "IGG" groups and groups for which the
    DiffBind sheet turns out empty. With stage "correlations" (or "all") one
    sample-correlation job is created per genome.

    Attributes read from `args`: csv, project_name, stage, queue, time, cpus, mem,
    user_mail and dry_run.

    Returns tuple of (logger, log path in the current directory,
    log path in the project "runs" directory).
    Exits the program if the annotation sheet is missing or a job submission fails.
    """
    logger.info("Starting sample comparison.")

    logger.debug("Checking project directories exist and creating if not.")
    htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger)

    # Parse sample information
    args.csv = os.path.abspath(args.csv)

    # check if exists and is a file
    if not os.path.isfile(args.csv):
        logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv)
        sys.exit(1)

    # read in
    samples = pd.read_csv(args.csv)

    # TODO:
    # Perform checks on the variables given
    # (e.g. columns existing, bams/peaks existing)

    # start pipeline
    projectName = "_".join([args.project_name, time.strftime("%Y%m%d-%H%M%S")])

    # track jobs to submit
    jobs = dict()

    # diffBind
    if args.stage in ["all", "diffbind"]:

        # Submit separate jobs for each genome
        # (grouping by a scalar key keeps the group names scalar)
        genome_groups = samples.groupby("genome").groups

        for genome, genomeRows in genome_groups.items():
            # Separate comparison per IPed factor
            df = samples.loc[genomeRows]
            IP_groups = df.groupby("ip").groups

            for IP, ipRows in IP_groups.items():
                if IP.upper() in ["INPUT", "IGG"]:  # skip groups with control
                    continue

                jobName = projectName + "_" + "diffBind_{0}_{1}".format(genome, IP)
                diffBindSheetFile = os.path.join(projectDir, "runs", jobName + ".csv")
                logger.debug("DiffBind sheet: %s" % diffBindSheetFile)

                # make diffBind csv file, save it
                # (returns True when the resulting sheet is empty)
                empty = makeDiffBindSheet(
                    samples,
                    samples.loc[ipRows],
                    os.path.join(dataDir, "peaks"),
                    diffBindSheetFile
                )

                if not empty:
                    # create job
                    jobCode = slurmHeader(
                        jobName=jobName,
                        output=os.path.join(projectDir, "runs", jobName + ".slurm.log"),
                        queue=args.queue,
                        time=args.time,
                        cpusPerTask=args.cpus,
                        memPerCpu=args.mem,
                        userMail=args.user_mail
                    )
                    jobCode += diffBind(
                        inputCSV=diffBindSheetFile,
                        jobName=jobName,
                        plotsDir=os.path.join(resultsDir, 'plots')
                    )
                    jobCode += slurmFooter()
                    jobs[jobName] = jobCode

    # Submit job for all samples together (per genome)
    if args.stage in ["all", "correlations"]:
        # Submit separate jobs for each genome
        genome_groups = samples.groupby("genome").groups

        for genome, genomeRows in genome_groups.items():
            jobName = projectName + "_" + genome + "_sample_correlations"

            # All samples of this genome
            df = samples.loc[genomeRows]

            jobCode = slurmHeader(
                jobName=jobName,
                output=os.path.join(projectDir, "runs", jobName + ".slurm.log"),
                queue=args.queue,
                time=args.time,
                cpusPerTask=args.cpus,
                memPerCpu=args.mem,
                userMail=args.user_mail
            )
            jobCode += plotCorrelations(
                inputCoverage=[os.path.join(dataDir, "coverage", sampleName + ".cov") for sampleName in list(df['sampleName'])],
                plotsDir=os.path.join(resultsDir, 'plots')
            )
            jobCode += slurmFooter()
            jobs[jobName] = jobCode

    # Submit jobs to slurm
    for jobName, jobCode in jobs.items():
        # Output file name
        jobFile = os.path.join(projectDir, "runs", jobName + ".sh")

        with open(jobFile, 'w') as handle:
            handle.write(textwrap.dedent(jobCode))

        # Submit to slurm
        if not args.dry_run:
            logger.info("Submitting jobs to slurm")
            status = slurmSubmitJob(jobFile)

            if status != 0:
                logger.error("Slurm job '%s' not successfull" % jobFile)
                sys.exit(1)
            logger.debug("Project '%s'submission finished successfully." % args.project_name)

    logger.debug("Finished comparison")

    return (logger,
            os.path.join(os.getcwd(), args.project_name + ".log"),
            os.path.join(projectDir, "runs", args.project_name + ".log"))


def getReadType(bamFile, n=10):
    """
    Gets the read type (single, paired) and length of bam file.
    Inspects at most the first `n` alignments printed by `samtools view`.

    Returns tuple of (readType=string, readLength=int), where readType is "PE" if
    more than half of the inspected alignments have the paired flag set and "SE"
    otherwise, and readLength is the most abundant read length among them.

    Raises ValueError if no alignment could be read from the file.
    """
    from collections import Counter
    from itertools import islice

    p = subprocess.Popen(['samtools', 'view', bamFile],
                         stdout=subprocess.PIPE, universal_newlines=True)

    # Count paired alignments and read lengths
    paired = 0
    sampled = 0
    readLength = Counter()
    try:
        # islice stops quietly if the file holds fewer than n alignments
        for record in islice(p.stdout, max(n, 0)):
            fields = record.split("\t")
            flag = int(fields[1])
            readLength[len(fields[9])] += 1
            if 1 & flag:  # check decimal flag contains 1 (paired)
                paired += 1
            sampled += 1
    finally:
        # stop samtools and reap it so no zombie process is left behind
        p.stdout.close()
        if p.poll() is None:
            try:
                p.kill()
            except OSError:
                pass
        p.wait()

    if sampled == 0:
        raise ValueError("No alignments could be read from '%s'." % bamFile)

    # Get most abundant read length
    mostAbundantLength = readLength.most_common(1)[0][0]

    # If more than half of the inspected alignments is paired, it is paired-end
    if paired * 2 > sampled:
        return ("PE", mostAbundantLength)
    else:
        return ("SE", mostAbundantLength)


def parseBowtieStats(statsFile):
    """
    Parses Bowtie2 stats file, returns series with values.

    The series is indexed by "readCount", "unpaired", "unaligned", "unique",
    "multiple" and "alignmentRate". Values are the digit strings found in the file
    (for "readCount" and "alignmentRate" only the leading digits of the line are kept).
    Each field is parsed on its own, so a field whose line is absent from the file
    stays empty (NaN) without affecting the others.
    """
    stats = pd.Series(
        index=["readCount", "unpaired", "unaligned", "unique", "multiple", "alignmentRate"],
        dtype=object
    )

    with open(statsFile) as handle:
        content = handle.readlines()  # list of strings per line

    def firstLineWith(tag):
        # first line of the file containing the tag, None if there is none
        for text in content:
            if tag in text:
                return text
        return None

    # fields reported as leading number of the line
    leading = [
        ("readCount", " reads; of these:"),
        ("alignmentRate", "overall alignment rate")
    ]
    for field, tag in leading:
        text = firstLineWith(tag)
        if text is not None:
            stats[field] = re.sub(r"\D.*", "", text)

    # fields reported as "<count> (<percentage>) <description>"
    counted = [
        ("unpaired", "were unpaired; of these:"),
        ("unaligned", "aligned 0 times"),
        ("unique", "aligned exactly 1 time"),
        ("multiple", "aligned >1 times")
    ]
    for field, tag in counted:
        text = firstLineWith(tag)
        if text is not None:
            stats[field] = re.sub(r"\D", "", re.sub(r"\(.*", "", text))

    return stats


def parseMACSModel(modelFile):
    """
    Parses the vectors of a MACS model R script, returns dict with values.

    For each of the variables "p", "m", "ycorr", "xcorr" and "altd", the first line
    assigning it with `<name> <- c(...)` is read and its comma-separated content
    returned as a list of floats. Variables not found in the file are left out of
    the dict, without affecting the others.
    """
    out = dict()

    with open(modelFile) as handle:
        content = handle.readlines()  # list of strings per line

    patt = re.compile(r"\((.+)\)")

    # variable name -> text marking its assignment (altd is written with two spaces)
    markers = [
        ("p", "p <- c("),
        ("m", "m <- c("),
        ("ycorr", "ycorr <- c("),
        ("xcorr", "xcorr <- c("),
        ("altd", "altd  <- c(")
    ]
    for name, marker in markers:
        for text in content:
            if marker not in text:
                continue
            match = patt.search(text)
            if match is not None:
                out[name] = [float(i) for i in match.group(1).split(",")]
            break  # only the first line with the marker is considered
    return out


def parseQC(sampleName, qcFile):
    """
    Parses QC table produced by phantompeakqualtools (spp) and returns the quality
    metrics of one sample.

    The first line of the file containing `sampleName` is split on tabs and its
    last three fields are returned as a series indexed by "NSC", "RSC" and
    "qualityTag". The series is empty (NaN) if the file cannot be read or no line
    contains the sample name.
    """
    series = pd.Series(index=["NSC", "RSC", "qualityTag"], dtype=object)
    try:
        with open(qcFile) as handle:
            content = handle.readlines()  # list of strings per line
    except (IOError, OSError):
        # QC file missing or unreadable: report no metrics
        return series

    wanted = str(sampleName)
    for text in content:
        if wanted not in text:
            continue
        attrs = text.strip().split("\t")
        if len(attrs) >= 3:
            series["NSC"] = attrs[-3]
            series["RSC"] = attrs[-2]
            series["qualityTag"] = attrs[-1]
        break  # only the first line with the sample name is considered
    return series


def makeDiffBindSheet(samples, df, peaksDir, sheetFile):
    """
    Prepares and saves a diffBind annotation sheet from a pandas.DataFrame annotation with standard columns.

    `samples` is the complete annotation sheet (used to look up the bam file of each
    control), `df` holds the rows to include in the sheet, `peaksDir` is the
    directory holding one peak directory per sample and `sheetFile` is the csv file
    to write. Rows without a control sample name are left out.

    Returns True if the resulting sheet is empty (nothing is written in that case),
    False otherwise. The data frames passed in are not modified.
    """
    # Exclude samples without matched control
    # (work on a copy so the assignments below never touch the caller's frame)
    df = df[df["controlSampleName"].notnull()].copy()

    # Nothing to compare: report empty sheet without writing anything
    if df.empty:
        return True

    # Convert numerical fields to str
    for col in ["numberCells", "technique", "treatment", "patient"]:
        df[col] = df[col].apply(str)

    # Merge some annotation fields
    df["condition"] = df["numberCells"] + "_" + df["technique"]
    df["treatment"] = df["treatment"] + "_" + df["patient"]

    # Complete rest of annotation
    df["ControlID"] = df['controlSampleName']
    df["bamControl"] = [samples[samples['sampleName'] == ctrl]["unmappedBam"].tolist()[0] for ctrl in df['controlSampleName']]
    # TODO:
    # Change peak path according to narrow/broad
    df["Peaks"] = [os.path.join(peaksDir, sampleName, sampleName + "_peaks.narrowPeak") for sampleName in df["sampleName"]]
    df["PeakCaller"] = "macs"

    # Filter columns used
    df = df[["sampleName", "cellLine", "ip", "condition", "treatment",
             "biologicalReplicate", "unmappedBam", "ControlID", "bamControl", "Peaks", "PeakCaller"]]
    # Rename columns
    df.columns = ["SampleID", "Tissue", "Factor", "Condition", "Treatment",
                  "Replicate", "bamReads", "ControlID", "bamControl", "Peaks", "PeakCaller"]

    # If resulting table is not empty, write to disk
    if not df.empty:
        # save as csv
        df.to_csv(sheetFile, index=False)  # check if format complies with DiffBind

    # Return bool of empty
    return df.empty


def slurmHeader(jobName, output, queue="shortq", ntasks=1, time="10:00:00",
                cpusPerTask=16, memPerCpu=2000, nodes=1, userMail=""):
    """
    Returns the opening of a slurm batch script: the shebang, the #SBATCH
    directives filled with the given resources and names, and commands
    printing the host name and start date.
    """
    template = """    #!/bin/bash
    #SBATCH --partition={0}
    #SBATCH --ntasks={1}
    #SBATCH --time={2}

    #SBATCH --cpus-per-task={3}
    #SBATCH --mem-per-cpu={4}
    #SBATCH --nodes={5}

    #SBATCH --job-name={6}
    #SBATCH --output={7}

    #SBATCH --mail-type=end
    #SBATCH --mail-user={8}

    # Start running the job
    hostname
    date

    """
    # order of values follows the numbering of the fields in the template
    values = (queue, ntasks, time, cpusPerTask, memPerCpu,
              nodes, jobName, output, userMail)

    return template.format(*values)


def slurmFooter():
    """
    Returns the closing of a slurm batch script, which prints the end date of the job.
    """
    return """
    # Job end
    date

    """


def slurmSubmitJob(jobFile):
    """
    Submits a job file to slurm with `sbatch`.

    The job file is passed as a single argument without going through a shell,
    so paths with spaces or shell metacharacters are submitted as they are.

    Returns the exit status of sbatch (0 on success). If sbatch cannot be
    executed at all, returns 127.
    """
    try:
        return subprocess.call(["sbatch", jobFile])
    except OSError:
        # sbatch not installed or not executable: report failure to the caller
        return 127


def removeFile(fileName):
    """
    Returns the shell snippet that removes fileName.
    """
    template = """
    # Removing file
    rm {0}
    """

    return template.format(fileName)


def moveFile(old, new):
    """
    Returns the shell snippet that moves the file `old` to `new`.
    """
    template = """
    # Moving file

    mv {0} {1}
    """

    return template.format(old, new)


def makeDir(directory):
    """
    Returns the shell snippet that creates `directory`, along with any missing
    parent directories (`mkdir -p`).
    """
    command = """
    # Creating directory

    mkdir -p {0}
    """.format(directory)

    return command


def mergeBams(inputBams, outputBam):
    """
    Returns the shell snippet that merges the bam files in the list `inputBams`
    into `outputBam` with Picard MergeSamFiles.
    """
    # one INPUT= argument per bam file, separated by spaces
    inputs = " ".join("INPUT=%s" % (bam,) for bam in inputBams)

    template = """
    # Merging bam files from replicates
    echo "Merging bam files from replicates"

    java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/MergeSamFiles.jar \\
    USE_THREADING=TRUE \\
    {1} \\
    OUTPUT={0}
    """

    return template.format(outputBam, inputs)


def fastqc(inputBam, outputDir):
    """
    Returns the shell snippet that loads the java and FastQC modules and runs
    FastQC on `inputBam`, writing its output to `outputDir`.
    """
    template = """
    # Measuring sample quality with Fastqc
    echo "Measuring sample quality with Fastqc"
    module load java/jdk/1.7.0_65
    module load FastQC/0.11.2

    fastqc --noextract \\
    --outdir {0} \\
    {1}

    """

    return template.format(outputDir, inputBam)


def bam2fastq(inputBam, outputFastq, outputFastq2=None, unpairedFastq=None):
    """
    Returns the shell snippet that converts `inputBam` to fastq with Picard SamToFastq.

    Only `outputFastq` is written when neither `outputFastq2` nor `unpairedFastq`
    is given; otherwise second-end and unpaired fastq files are requested too.
    """
    pieces = ["""
    # Convert to Fastq format
    echo "Converting to Fastq format"

    java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/SamToFastq.jar \\
    INPUT={0} \\
    """.format(inputBam)]

    if outputFastq2 is not None or unpairedFastq is not None:
        pieces.append("""FASTQ={0} \\
    SECOND_END_FASTQ={1} \\
    UNPAIRED_FASTQ={2}

    """.format(outputFastq, outputFastq2, unpairedFastq))
    else:
        pieces.append("""FASTQ={0}

    """.format(outputFastq))

    return "".join(pieces)


def trimmomatic(inputFastq1, outputFastq1, cpus, adapters, log,
                inputFastq2=None, outputFastq1unpaired=None,
                outputFastq2=None, outputFastq2unpaired=None):
    """
    Returns the shell snippet that trims adapters with trimmomatic.

    Runs in paired-end mode ("PE") when `inputFastq2` is given, in which case the
    second input and the unpaired/second outputs are added to the command;
    otherwise runs in single-end mode ("SE").
    """
    paired = inputFastq2 is not None
    mode = "PE" if paired else "SE"

    # the command is assembled from consecutive pieces, in order
    pieces = ["""
    # Trimming adapters from sample
    echo "Trimming adapters from sample"
    module load trimmomatic/0.32

    java -Xmx4g -jar `which trimmomatic-0.32.jar` """]

    pieces.append("""{0} \\
    -threads {1} \\
    -trimlog {2} \\
    {3} \\
    """.format(mode, cpus, log, inputFastq1))

    if paired:
        pieces.append("""\\
    {0} \\
    """.format(inputFastq2))

    pieces.append("""\\
    {0} \\
    """.format(outputFastq1))

    if paired:
        pieces.append("""\\
    {0} \\
    {1} \\
    {2} \\
    """.format(outputFastq1unpaired,
               outputFastq2, outputFastq2unpaired))

    pieces.append("""\\
    ILLUMINACLIP:{0}:1:40:15:8:true \\
    LEADING:3 TRAILING:3 \\
    SLIDINGWINDOW:4:10 \\
    MINLEN:36

    """.format(adapters))

    return "".join(pieces)


def skewer(inputFastq1, outputPrefix, cpus, adapters, inputFastq2=None):
    """
    Returns the shell snippet that trims adapters with skewer.

    Runs in paired-end mode ("pe") when `inputFastq2` is given and in mode "any"
    otherwise. In both cases the snippet ends with a blank line, like the other
    command snippets, so it can be safely followed by any other command.
    """
    PE = inputFastq2 is not None
    mode = "pe" if PE else "any"

    command = """
    # Trimming adapters from sample
    echo "Trimming adapters from sample"

    skewer --quiet \\
    -t {0} \\
    -m {1} \\
    -x {2} \\
    -o {3} \\
    {4}""".format(cpus, mode, adapters, outputPrefix, inputFastq1)

    if PE:
        # second fastq goes on a continuation line
        command += """ \\
    {0}""".format(inputFastq2)

    # terminate the command in both modes
    command += """

    """

    return command


def bowtie2Map(inputFastq1, outputBam, log, metrics, genomeIndex, maxInsert, cpus, inputFastq2=None):
    """
    Returns the shell snippet that maps reads with Bowtie2 and pipes the alignments
    through samtools to get a sorted bam file.

    With `inputFastq2` the reads are mapped as pairs, allowing fragments of up to
    `maxInsert` bp (--maxins option); otherwise `inputFastq1` is mapped alone.
    A trailing ".bam" is removed from `outputBam` before it is given to `samtools sort`.
    """
    sortPrefix = re.sub(r"\.bam$", "", outputBam)

    pieces = ["""
    # Mapping reads with Bowtie2
    echo "Mapping reads with Bowtie2"
    module load bowtie/2.2.3
    module load samtools

    bowtie2 --very-sensitive -p {0} \\
    -x {1} \\
    --met-file {2} \\
    """.format(cpus, genomeIndex, metrics)]

    if inputFastq2 is not None:
        pieces.append("""--maxins {0} -1 {1} \\
    -2 {2} """.format(maxInsert, inputFastq1, inputFastq2))
    else:
        pieces.append("{0} ".format(inputFastq1))

    pieces.append("""\\
    2> {0} | \\
    samtools view -S -b - | \\
    samtools sort - {1}

    """.format(log, sortPrefix))

    return "".join(pieces)


def shiftReads(inputBam, genome, outputBam):
    """
    Returns the shell snippet that shifts the reads of a tagmented sample.

    The alignments of `inputBam` are piped through the `lib/shift_reads.py` script
    located next to this file (called with `genome`), then converted to bam and
    sorted. A trailing ".bam" is removed from `outputBam` before it is given to
    `samtools sort`.
    """
    # TODO:
    # Implement read shifting with HTSeq or Cython
    sortPrefix = re.sub(r"\.bam$", "", outputBam)
    # directory holding this file, where the lib/ scripts are looked for
    scriptDir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

    template = """
    # Shifting read of tagmented sample
    echo "Shifting read of tagmented sample"
    module load samtools
    module load python

    samtools view -h {0} | \\
    python {3}/lib/shift_reads.py {1} | \\
    samtools view -S -b - | \\
    samtools sort - {2}

    """

    return template.format(inputBam, genome, sortPrefix, scriptDir)


def markDuplicates(inputBam, outputBam, metricsFile, tempDir="."):
    """
    Returns the shell snippet that marks duplicates with Picard MarkDuplicates and
    sorts the result with samtools.

    Picard writes to a transient "<output>.dups.nosort.bam" file, which is sorted
    into `outputBam` (given to `samtools sort` without its trailing ".bam") and
    removed afterwards if it is not empty.
    """
    sortPrefix = re.sub(r"\.bam$", "", outputBam)
    unsortedBam = sortPrefix + ".dups.nosort.bam"

    template = """
    # Marking duplicates with piccard
    echo "Mark duplicates with piccard"
    module load samtools

    java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/MarkDuplicates.jar \\
    INPUT={0} \\
    OUTPUT={1} \\
    METRICS_FILE={2} \\
    VALIDATION_STRINGENCY=LENIENT \\
    TMP_DIR={3}

    # Sort bam file with marked duplicates
    samtools sort {1} \\
    {4}

    if [[ -s {1} ]]
        then
        rm {1}
    fi

    """

    return template.format(inputBam, unsortedBam, metricsFile, tempDir, sortPrefix)


def removeDuplicates(inputBam, outputBam, cpus=16):
    """
    Returns the shell snippet that removes duplicates from `inputBam` with
    `sambamba markdup -r`, using `cpus` threads and writing to `outputBam`.
    """
    template = """
    # Removing duplicates with sambamba
    echo "Remove duplicates with sambamba"
    sambamba markdup -t {2} -r \\
    {0} \\
    {1}

    """

    return template.format(inputBam, outputBam, cpus)


def indexBam(inputBam):
    """
    Returns the shell snippet that indexes `inputBam` with samtools.
    """
    template = """
    # Indexing bamfile with samtools
    echo "Indexing bamfile with samtools"
    module load samtools

    samtools index {0}

    """

    return template.format(inputBam)


def peakTools(inputBam, output, plot, cpus):
    """
    Returns the shell snippet that computes sample QC metrics with `run_spp.R`,
    reading `inputBam`, writing the metrics to `output` and the plot to `plot`.

    `cpus` is accepted but not used in the command.
    """
    template = """
    # Get sample QC metrics
    echo "Getting sample QC metrics"

    module load boost/1.57
    module load gcc/4.8.2
    module load openmpi/gcc/64/1.8.1

    run_spp.R -rf -savp -savp={2} -s=0:5:500 \\
    -c={0} -out={1}

    """

    return template.format(inputBam, output, plot)


def qc():
    """
    Placeholder for library complexity QC, not implemented yet.

    Commands noted for a future implementation:

    $PRESEQ c_curve -B $PROJECTDIR/mapped/$SAMPLE_NAME.trimmed.bowtie2.sorted.shifted.dup.bam
        -o $PROJECTDIR/mapped/$SAMPLE_NAME_qc_c_curve.txt

    $PRESEQ lc_extrap -e 1e8 -s 2e6 -B $PROJECTDIR/mapped/$SAMPLE_NAME.trimmed.bowtie2.sorted.shifted.dup.bam
        -o $PROJECTDIR/mapped/$SAMPLE_NAME_qc_lc_extrap.txt

    Always raises NotImplementedError.
    """
    message = "Function not implemented yet."
    raise NotImplementedError(message)


def bamToUCSC(inputBam, outputBigWig, genomeSizes, genome, tagmented=False):
    """
    Returns the shell snippet that makes a bigWig track from a bam file.

    For tagmented samples, coverage is computed on the 5' position of the reads
    (`genomeCoverageBed -5`). Otherwise reads are extended by 130 bp on their
    strand and passed through `lib/fix_bedfile_genome_boundaries.py` (located next
    to this file, called with `genome`) before coverage is computed.
    The intermediate ".cov" file is removed and the bigWig made world-readable.
    """
    # intermediate coverage file is named after the bigWig, without its extension
    covPrefix = os.path.abspath(re.sub(r"\.bigWig", "", outputBigWig))
    # directory holding this file, where the lib/ scripts are looked for
    scriptDir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

    if tagmented:
        template = """
    # Making bigWig tracks from bam file
    echo "making bigWig tracks from bam file"
    module load bedtools

    bedtools bamtobed -i {0} |
    genomeCoverageBed -5 -i stdin -bg -g {1} \\
    > {2}.cov

    bedGraphToBigWig {2}.cov \\
    {1} \\
    {3}

    # remove cov file
    if [[ -s {2}.cov ]]
        then
        rm {2}.cov
    fi

    chmod 755 {3}

    """
        return template.format(inputBam, genomeSizes, covPrefix, outputBigWig, scriptDir)

    # TODO:
    # adjust fragment length dependent on read size and real fragment size
    # (right now it assumes 50bp reads with 180bp fragments)
    template = """
    # Making bigWig tracks from bam file
    echo "making bigWig tracks from bam file"
    module load bedtools

    bedtools bamtobed -i {0} | \\
    bedtools slop -i stdin -g {1} -s -l 0 -r 130 | \\
    python {5}/lib/fix_bedfile_genome_boundaries.py {4} | \\
    genomeCoverageBed -i stdin -bg -g {1} \\
    > {2}.cov

    bedGraphToBigWig {2}.cov \\
    {1} \\
    {3}

    # remove cov file
    if [[ -s {2}.cov ]]
        then
        rm {2}.cov
    fi

    chmod 755 {3}

    """
    return template.format(inputBam, genomeSizes, covPrefix, outputBigWig, genome, scriptDir)


def addTrackToHub(sampleName, trackURL, trackHub, colour, fivePrime=""):
    """
    Returns the shell snippet that appends a bigWig track line to the `trackHub`
    file and makes that file world-readable.

    The track is named and described as "<sampleName> <fivePrime>", points to
    `trackURL` and is drawn in `colour`.
    """
    # the track line is echoed in two parts: naming first, display settings second
    naming = """
    # Adding track to TrackHub
    echo "Adding track to TrackHub"
    echo "track type=bigWig name='{0} {1}' description='{0} {1}' """.format(sampleName, fivePrime)
    display = """height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> \\
    {2}

    chmod 755 {2}

    """.format(trackURL, colour, trackHub)

    return naming + display


def linkToTrackHub(trackHubURL, fileName, genome):
    """
    Write an HTML page that redirects to the UCSC genome browser.

    The page holds a ``meta refresh`` tag pointing at ``hgTracks`` with the
    track hub given as ``hgt.customText``.

    :param trackHubURL: URL placed in the ``hgt.customText`` query parameter.
    :param fileName: Path of the HTML file to (over)write.
    :param genome: Genome name. ``"hg19"`` is sent as ``org=human``; any other
        value is sent as ``db=<genome>``.
    """
    # hg19 uses a different query parameter and the organism name "human".
    is_human = genome == "hg19"
    query_key = "org" if is_human else "db"
    query_value = "human" if is_human else genome

    target = "http://genome.ucsc.edu/cgi-bin/hgTracks?{0}={1}&hgt.customText={2}".format(
        query_key, query_value, trackHubURL
    )
    page = "\n".join([
        "",
        "    <html>",
        "        <head>",
        '            <meta http-equiv="refresh" content="0; url={0}" />'.format(target),
        "        </head>",
        "    </html>",
        "    ",
    ])

    with open(fileName, 'w') as handle:
        handle.write(textwrap.dedent(page))


def genomeWideCoverage(inputBam, genomeWindows, output):
    """
    Build the shell commands that run ``bedtools coverage`` over genome windows.

    :param inputBam: File passed with ``-a``.
    :param genomeWindows: File passed with ``-b``.
    :param output: File the command's standard output is redirected to.
    :returns: The shell snippet as a string.
    """
    script_lines = [
        "",
        "    # Calculate genome-wide coverage",
        '    echo "Calculating genome-wide coverage"',
        "    module load bedtools",
        "",
        "    bedtools coverage -abam -counts \\",
        "    -a {bam} \\",
        "    -b {windows} \\",
        "    > {out}",
        "",
        "    ",
    ]
    return "\n".join(script_lines).format(bam=inputBam, windows=genomeWindows, out=output)


def calculateFRiP(inputBam, inputBed, output):
    """
    Build a one-line shell pipeline that sums read counts over regions.

    The first three columns of ``inputBed`` are piped into
    ``bedtools coverage -counts`` against ``inputBam``; ``awk`` then sums the
    fourth column and the total is written to ``output``.

    :param inputBam: BAM file given to ``-abam``.
    :param inputBed: BED file whose columns 1-3 are used.
    :param output: File receiving the summed count.
    :returns: The shell pipeline as a string (no leading or trailing newline).
    """
    # Each stage keeps its own trailing space so that joining with "| "
    # yields the usual " | " separator.
    stages = [
        "cut -f 1,2,3 {0} ".format(inputBed),
        "bedtools coverage -counts -abam {0} -b - ".format(inputBam),
        "awk '{{sum+=$4}} END {{print sum}}' > {0}".format(output),
    ]
    return "| ".join(stages)


def macs2CallPeaks(treatmentBam, controlBam, outputDir, sampleName, genome, broad=False, plot=True):
    """
    Build the shell commands that call peaks with ``macs2 callpeak``.

    :param treatmentBam: BAM file passed with ``-t``.
    :param controlBam: BAM file passed with ``-c``.
    :param outputDir: Directory passed with ``--outdir``.
    :param sampleName: Name passed with ``-n``; also the prefix of the
        ``_model.r`` / ``_model.pdf`` files used by the plotting step.
    :param genome: Genome name. ``"hg19"`` becomes ``hs`` and ``"mm10"``
        becomes ``mm`` for ``-g``; other values are passed through unchanged.
    :param broad: Use the broad-peak options instead of the narrow-peak ones.
    :param plot: For narrow peaks only, also run the ``_model.r`` script with
        Rscript and move the resulting PDF into ``outputDir``.
    :returns: The shell snippet as a string.
    :raises ValueError: If ``genome`` is ``"dr7"``.
    """
    if genome == "hg19":
        genome = "hs"
    elif genome == "mm10":
        genome = "mm"
    elif genome == "dr7":
        raise ValueError("Genome dr7 not yet supported for peak calling with MACS2.")

    if broad:
        # Parameter setting for broad factors according to Nature Protocols (2012)
        # Vol.7 No.9 1728-1740 doi:10.1038/nprot.2012.101 Protocol (D) for H3K36me3
        template = """
    # Call peaks with MACS2
    echo "Calling peaks with MACS2"

    macs2 callpeak \\
    -t {treatment} \\
    -c {control} \\
    --broad --nomodel --extsize 73 --pvalue 1e-3 \\
    -g {genome} -n {name} --outdir {outdir}

    """
    else:
        # A first variant without "--fix-bimodal --extsize 200" used to be
        # assigned here and immediately overwritten; only this one was ever
        # returned.
        template = """
    # Call peaks with MACS2
    echo "Calling peaks with MACS2"

    macs2 callpeak \\
    -t {treatment} \\
    -c {control} \\
    --bw 200 \\
    --fix-bimodal --extsize 200 \\
    -g {genome} -n {name} --outdir {outdir}

    """

    command = template.format(
        treatment=treatmentBam, control=controlBam,
        genome=genome, name=sampleName, outdir=outputDir
    )

    if plot and not broad:
        # The PDF is moved from the directory that is the current working
        # directory at the time this command string is built.
        command += """
    # Plot MACS2 crosscorrelation

    echo "Plotting MACS2 crosscorrelation"
    module load R

    Rscript {0}/{1}_model.r

    mv {2}/{1}_model.pdf {0}/{1}_model.pdf

    """.format(outputDir, sampleName, os.getcwd())

    return command


def sppCallPeaks(treatmentBam, controlBam, treatmentName, controlName, outputDir, broad, cpus):
    """
    Build the shell commands that call peaks with ``lib/spp_peak_calling.R``.

    The R script receives, in order: treatment BAM, control BAM, treatment
    name, control name, ``TRUE``/``FALSE`` for ``broad``, the CPU count and
    the output directory.

    :param treatmentBam: Treatment BAM file.
    :param controlBam: Control BAM file.
    :param treatmentName: Name of the treatment sample.
    :param controlName: Name of the control sample.
    :param outputDir: Output directory, passed as the last argument.
    :param broad: Truthy to pass ``TRUE``, falsy to pass ``FALSE``.
    :param cpus: Number of CPUs passed to the script.
    :returns: The shell snippet as a string.
    """
    broad = "TRUE" if broad else "FALSE"
    scriptsDir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

    # The last argument used to repeat the script directory, which left
    # outputDir unused; it is now the output directory.
    # NOTE(review): argument order assumed from this call site -- confirm
    # against lib/spp_peak_calling.R.
    command = """
    # Calling peaks with SPP
    echo "calling peaks with SPP"
    module load R

    Rscript {scripts}/lib/spp_peak_calling.R \\
    {treatment} \\
    {control} \\
    {treatment_name} \\
    {control_name} \\
    {broad} \\
    {cpus} \\
    {outdir}

    """.format(scripts=scriptsDir, treatment=treatmentBam, control=controlBam,
               treatment_name=treatmentName, control_name=controlName,
               broad=broad, cpus=cpus, outdir=outputDir)

    return command


def homerFindMotifs(peakFile, genome, outputDir, size=150, length="8,10,12,14,16", n_motifs=12):
    """
    Build the shell commands that run Homer's ``findMotifsGenome.pl``.

    :param peakFile: First positional argument of the script.
    :param genome: Second positional argument of the script.
    :param outputDir: Third positional argument of the script.
    :param size: Value for ``-size``.
    :param length: Value for ``-len``.
    :param n_motifs: Value for ``-S``.
    :returns: The shell snippet as a string.
    """
    script_lines = [
        "",
        "    # Find motifs with Homer",
        '    echo "finding motifs with Homer"',
        "",
        "    findMotifsGenome.pl {peaks} \\",
        "    {genome} \\",
        "    {outdir} \\",
        "    -mask -size {size} -len {length} -S {n_motifs}",
        "",
        "    ",
    ]
    return "\n".join(script_lines).format(
        peaks=peakFile, genome=genome, outdir=outputDir,
        size=size, length=length, n_motifs=n_motifs
    )


def AnnotatePeaks(peakFile, genome, motifFile, outputBed):
    """
    Build the shell commands that score peaks with Homer's ``annotatePeaks.pl``.

    The script output is piped through ``tail -n +2`` (starts at the second
    line) and ``cut -f 1,5,22`` before being written to ``outputBed``.

    :param peakFile: First positional argument of ``annotatePeaks.pl``.
    :param genome: Second positional argument of ``annotatePeaks.pl``.
    :param motifFile: File passed with ``-m``.
    :param outputBed: File the result is redirected to.
    :returns: The shell snippet as a string.
    """
    script_lines = [
        "",
        "    # Annotate peaks with motif score",
        '    echo "annotating peaks with motif score"',
        "",
        "    annotatePeaks.pl {peaks} \\",
        "    {genome} \\",
        "    -mask -mscore -m {motifs} | \\",
        "    tail -n +2 | cut -f 1,5,22 \\",
        "    > {out}",
        "    ",
    ]
    return "\n".join(script_lines).format(
        peaks=peakFile, genome=genome, motifs=motifFile, out=outputBed
    )


def centerPeaksOnMotifs(peakFile, genome, windowWidth, motifFile, outputBed):
    """
    Build the shell commands that centre peaks on a motif.

    ``annotatePeaks.pl -size <windowWidth> -center <motifFile>`` is piped
    through two ``awk`` steps (column reordering, then replacing "0" with "+"
    and "1" with "-" in column 6), ``lib/fix_bedfile_genome_boundaries.py``
    and ``sortBed``; the result is written to ``outputBed``.

    :param peakFile: First positional argument of ``annotatePeaks.pl``.
    :param genome: Genome given to ``annotatePeaks.pl`` and to the
        boundary-fixing script.
    :param windowWidth: Value for ``-size``.
    :param motifFile: File passed with ``-center``.
    :param outputBed: File the sorted result is redirected to.
    :returns: The shell snippet as a string.
    """
    scripts_dir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

    # The awk programs contain braces, so they are kept out of str.format().
    reorder_columns = "awk -v OFS='\\t' '{print $2, $3, $4, $1, $6, $5}'"
    # The -F argument is an actual TAB character in the generated script,
    # while OFS is the two-character sequence backslash-t.
    recode_strand = (
        "awk -v OFS='\\t' -F '\t' "
        "'{ gsub(\"0\", \"+\", $6) ; gsub(\"1\", \"-\", $6) ; print }'"
    )

    stages = [
        "annotatePeaks.pl {0} \\\n    {1} \\\n    -size {2} -center {3}".format(
            peakFile, genome, windowWidth, motifFile
        ),
        reorder_columns,
        recode_strand,
        "python {0}/lib/fix_bedfile_genome_boundaries.py {1}".format(scripts_dir, genome),
        "sortBed > {0}".format(outputBed),
    ]

    preamble = "\n    # Center peaks on motif\n    echo \"centering peaks on motif\"\n\n    "
    return preamble + " | \\\n    ".join(stages) + "\n    "


def peakAnalysis(inputBam, peakFile, plotsDir, windowWidth, fragmentsize,
                 genome, n_clusters, strand_specific, duplicates):
    """
    Build the shell commands that run ``lib/peaks_analysis.py``.

    :param inputBam: First positional argument of the script.
    :param peakFile: Second positional argument of the script.
    :param plotsDir: Third positional argument of the script.
    :param windowWidth: Value for ``--window-width``.
    :param fragmentsize: Value for ``--fragment-size``.
    :param genome: Value for ``--genome``.
    :param n_clusters: Value for ``--n_clusters``.
    :param strand_specific: Append ``--strand-specific`` when truthy.
    :param duplicates: Append ``--duplicates`` when truthy.
    :returns: The shell snippet as a string, always ending in a newline.
    """
    scriptsDir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
    command = """
    # Analyse peak profiles
    echo "Analysing peak profiles"

    python {0}/lib/peaks_analysis.py {1} \\
    {2} \\
    {3} \\
    --window-width {4} \\
    --fragment-size {5} \\
    --genome {6} \\
    --n_clusters {7}""".format(
        scriptsDir, inputBam, peakFile, plotsDir,
        windowWidth, fragmentsize, genome, n_clusters
    )

    flags = []
    if strand_specific:
        flags.append("--strand-specific")
    if duplicates:
        flags.append("--duplicates")

    # Previously only the "--duplicates" branch terminated the line; always
    # end with a newline so whatever is appended next starts on its own line.
    return " ".join([command] + flags) + "\n"


def tssAnalysis(inputBam, tssFile, plotsDir, windowWidth, fragmentsize, genome,
                n_clusters, strand_specific, duplicates):
    """
    Build the shell commands that run ``lib/tss_analysis.py``.

    :param inputBam: First positional argument of the script.
    :param tssFile: Second positional argument of the script.
    :param plotsDir: Third positional argument of the script.
    :param windowWidth: Value for ``--window-width``.
    :param fragmentsize: Value for ``--fragment-size``.
    :param genome: Value for ``--genome``.
    :param n_clusters: Value for ``--n_clusters``.
    :param strand_specific: Append ``--strand-specific`` when truthy.
    :param duplicates: Append ``--duplicates`` when truthy.
    :returns: The shell snippet as a string, always ending in a newline.
    """
    scriptsDir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
    command = """
    # Analyse signal over TSSs
    echo "Analysing signal over TSSs"

    python {0}/lib/tss_analysis.py {1} \\
    {2} \\
    {3} \\
    --window-width {4} \\
    --fragment-size {5} \\
    --genome {6} \\
    --n_clusters {7}""".format(
        scriptsDir, inputBam, tssFile, plotsDir,
        windowWidth, fragmentsize, genome, n_clusters
    )

    flags = []
    if strand_specific:
        flags.append("--strand-specific")
    if duplicates:
        flags.append("--duplicates")

    # The command used to end in a trailing space with no newline; terminate
    # the line so whatever is appended next starts on its own line.
    return " ".join([command] + flags) + "\n"


def footprintAnalysis():
    """
    Placeholder for footprint analysis.

    :raises NotImplementedError: Always.
    """
    message = "Function not implemented yet."
    raise NotImplementedError(message)


def plotCorrelations(inputCoverage, plotsDir):
    """
    Build the shell commands that run ``lib/correlations.py``.

    :param inputCoverage: Iterable of coverage files; they are passed to the
        script as space-separated arguments after ``plotsDir``.
    :param plotsDir: First positional argument of the script.
    :returns: The shell snippet as a string.
    """
    scriptsDir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
    # str() on each item matches the "%s" conversion used previously.
    coverageFiles = " ".join(str(item) for item in inputCoverage)

    command = """
    # Plot correlations
    echo "plotting correlations"

    python {scripts}/lib/correlations.py \\
    {plots} \\
    {coverage}

    """.format(scripts=scriptsDir, plots=plotsDir, coverage=coverageFiles)

    return command


def diffBind(inputCSV, jobName, plotsDir):
    """
    Build the shell commands that run ``lib/diffBind_analysis.R``.

    :param inputCSV: First argument passed to the R script.
    :param jobName: Second argument passed to the R script.
    :param plotsDir: Third argument passed to the R script.
    :returns: The shell snippet as a string.
    """
    scriptsDir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

    # The last argument carries no line continuation: a dangling backslash
    # there would splice the following script line onto this command.
    command = """
    # Detect differential binding with diffBind
    echo "Detecting differential binding with diffBind"
    module load R

    Rscript {scripts}/lib/diffBind_analysis.R \\
    {csv} \\
    {job} \\
    {plots}

    """.format(scripts=scriptsDir, csv=inputCSV, job=jobName, plots=plotsDir)

    return command


# Script entry point: run the pipeline and turn Ctrl-C into a short message
# and exit status 1.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("Program canceled by user!")
        raise SystemExit(1)