Created
September 26, 2019 09:26
-
-
Save afrendeiro/b39f8bd0b013bc5342f459da5762a609 to your computer and use it in GitHub Desktop.
An old pipeline for ChIP-seq/ChIPmentation data processing used back in 2014. Uploading for preservation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
ChIP-seq pipeline | |
""" | |
from argparse import ArgumentParser | |
import os | |
import sys | |
import logging | |
import pandas as pd | |
import numpy as np | |
import time | |
import random | |
import re | |
import string | |
import textwrap | |
import shutil | |
import subprocess | |
# TODO: solve pandas chained assignments | |
pd.options.mode.chained_assignment = None | |
__author__ = "Andre Rendeiro" | |
__copyright__ = "Copyright 2014, Andre Rendeiro" | |
__credits__ = [] | |
__license__ = "GPL2" | |
__version__ = "0.1" | |
__maintainer__ = "Andre Rendeiro" | |
__email__ = "arendeiro@cemm.oeaw.ac.at" | |
__status__ = "Development" | |
def main(): | |
# Parse command-line arguments | |
parser = ArgumentParser(description="ChIP-seq pipeline.") | |
# Global options | |
# positional arguments | |
# optional arguments | |
parser.add_argument("-r", "--project-root", default="/fhgfs/groups/lab_bock/shared/projects/", | |
dest="project_root", type=str, | |
help="""Directory in which the project will reside. | |
Default=/fhgfs/groups/lab_bock/shared/projects/.""") | |
parser.add_argument("--html-root", default="/fhgfs/groups/lab_bock/public_html/arendeiro/", | |
dest="html_root", type=str, | |
help="""public_html directory in which bigwig files for the project will reside. | |
Default=/fhgfs/groups/lab_bock/public_html/.""") | |
parser.add_argument("--url-root", default="http://www.biomedical-sequencing.at/bocklab/arendeiro/", | |
dest="url_root", type=str, | |
help="""Url mapping to public_html directory where bigwig files for the project will be accessed. | |
Default=http://www.biomedical-sequencing.at/bocklab.""") | |
parser.add_argument("--keep-tmp-files", dest="keep_tmp", action="store_true", | |
help="Keep intermediary files. If not it will only preserve final files. Default=False.") | |
parser.add_argument("-c", "--cpus", default=4, dest="cpus", | |
help="Number of CPUs to use. Default=4.", type=int) | |
parser.add_argument("-m", "--mem-per-cpu", default=4000, dest="mem", | |
help="Memory per CPU to use. Default=4000.", type=int) | |
parser.add_argument("-q", "--queue", default="shortq", dest="queue", | |
choices=["develop", "shortq", "mediumq", "longq"], | |
help="Queue to submit jobs to. Default=shortq", type=str) | |
parser.add_argument("-t", "--time", default="10:00:00", dest="time", | |
help="Maximum time for jobs to run. Default=10:00:00", type=str) | |
parser.add_argument("--user-mail", default="", dest="user_mail", | |
help="User mail address. Default=<submitting user>.", type=str) | |
parser.add_argument("-l", "--log-level", default="INFO", dest="log_level", | |
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], | |
help="Logging level. Default=INFO.", type=str) | |
parser.add_argument("--dry-run", dest="dry_run", action="store_true", | |
help="Dry run. Assemble commands, but do not submit jobs to slurm. Default=False.") | |
parser.add_argument("--no-checks", dest="checks", action="store_false", | |
help="Don't check file existence and integrity. Default=False.") | |
# Sub commands | |
subparser = parser.add_subparsers(title="sub-command", dest="command") | |
# preprocess | |
preprocess_subparser = subparser.add_parser("preprocess") | |
preprocess_subparser.add_argument(dest="project_name", help="Project name.", type=str) | |
preprocess_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str) | |
preprocess_subparser.add_argument("-s", "--stage", default="all", dest="stage", | |
choices=["all", "bam2fastq", "fastqc", "trim", "mapping", | |
"shiftreads", "markduplicates", "removeduplicates", | |
"indexbam", "maketracks", "coverage"], | |
help="Run only these stages. Default=all.", type=str) | |
preprocess_subparser.add_argument("--trimmer", default="skewer", choices=["trimmomatic", "skewer"], | |
dest="trimmer", help="Trimmer to use. Default=skewer.", type=str) | |
preprocess_subparser.add_argument("-i", "--max-insert-size", default=2000, | |
dest="maxinsert", | |
help="Maximum allowed insert size allowed for paired end mates. Default=2000.", | |
type=int) | |
preprocess_subparser.add_argument("--window-size", default=1000, dest="windowsize", | |
help="Window size used for genome-wide correlations. Default=1000.", | |
type=int) | |
# stats | |
stats_subparser = subparser.add_parser("stats") | |
stats_subparser.add_argument(dest="project_name", help="Project name.", type=str) | |
stats_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str) | |
# analyse | |
analyse_subparser = subparser.add_parser("analyse") | |
analyse_subparser.add_argument(dest="project_name", help="Project name.", type=str) | |
analyse_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str) | |
analyse_subparser.add_argument("-s", "--stage", default="all", dest="stage", | |
choices=["all", "qc", "callpeaks", "findmotifs", "centerpeaks", | |
"annotatepeaks", "peakanalysis", "tssanalysis", "footprints", "frip"], | |
help="Run only these stages. Default=all.", type=str) | |
analyse_subparser.add_argument("--peak-caller", default="macs2", choices=["macs2", "spp"], | |
dest="peak_caller", help="Peak caller to use. Default=macs2.", type=str) | |
analyse_subparser.add_argument("--peak-window-width", default=2000, | |
dest="peak_window_width", | |
help="Width of window around peak motifs. Default=2000.", | |
type=int) | |
# compare | |
compare_subparser = subparser.add_parser("compare") | |
compare_subparser.add_argument(dest="project_name", help="Project name.", type=str) | |
compare_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str) | |
compare_subparser.add_argument("-s", "--stage", default="all", dest="stage", | |
choices=["all", "diffbind", "correlations"], | |
help="Run only these stages. Default=all.", type=str) | |
compare_subparser.add_argument("--duplicates", dest="duplicates", action="store_true", | |
help="Allow duplicates in coorelation analysis. Default=False.") | |
compare_subparser.add_argument("--genome-window-width", default=1000, | |
dest="genome_window_width", | |
help="Width of window to make genome-wide correlations. Default=1000.", | |
type=int) | |
# Parse | |
args = parser.parse_args() | |
# Logging | |
logger = logging.getLogger(__name__) | |
levels = { | |
"DEBUG": logging.DEBUG, | |
"INFO": logging.INFO, | |
"WARNING": logging.WARNING, | |
"ERROR": logging.ERROR, | |
"CRITICAL": logging.CRITICAL | |
} | |
logger.setLevel(levels[args.log_level]) | |
# create a file handler | |
# (for now in current working dir, in the end copy log to projectDir) | |
handler = logging.FileHandler(os.path.join(os.getcwd(), args.project_name + ".log")) | |
handler.setLevel(logging.INFO) | |
# format logger | |
formatter = logging.Formatter(fmt='%(levelname)s: %(asctime)s - %(message)s', datefmt='%Y/%m/%d %H:%M:%S') | |
handler.setFormatter(formatter) | |
logger.addHandler(handler) | |
# create a stout handler | |
stdout = logging.StreamHandler(sys.stdout) | |
stdout.setLevel(logging.ERROR) | |
formatter = logging.Formatter(fmt='%(levelname)s: %(asctime)s - %(message)s', datefmt='%Y/%m/%d %H:%M:%S') | |
stdout.setFormatter(formatter) | |
logger.addHandler(stdout) | |
# Start main function | |
if args.command == "preprocess": | |
logger, fr, to = preprocess(args, logger) | |
elif args.command == "stats": | |
logger, fr, to = readStats(args, logger) | |
elif args.command == "analyse": | |
logger, fr, to = analyse(args, logger) | |
elif args.command == "compare": | |
logger, fr, to = compare(args, logger) | |
# Exit | |
logger.info("Finished and exiting.") | |
# Copy log to projectDir | |
shutil.copy2(fr, to) | |
sys.exit(1) | |
def checkProjectDirs(args, logger): | |
# Directories and paths | |
# check args.project_root exists and user has write access | |
args.project_root = os.path.abspath(args.project_root) | |
logger.debug("Checking if %s directory exists and is writable." % args.project_root) | |
if not os.access(args.project_root, os.W_OK): | |
logger.error("%s does not exist, or user has no write access.\n\ | |
Use option '-r' '--project-root' to set a non-default project root path." % args.project_root) | |
sys.exit(1) | |
# check args.html_root exists and user has write access | |
htmlDir = os.path.abspath(args.html_root) | |
logger.debug("Checking if %s directory exists and is writable." % args.project_root) | |
if not os.access(htmlDir, os.W_OK): | |
logger.error("%s does not exist, or user has no write access.\n\ | |
Use option '--html-root' to set a non-default html root path." % htmlDir) | |
sys.exit(1) | |
# project directories | |
projectDir = os.path.join(args.project_root, args.project_name) | |
dataDir = os.path.join(projectDir, "data") | |
resultsDir = os.path.join(projectDir, "results") | |
# make relative project dirs | |
dirs = [ | |
projectDir, | |
os.path.join(projectDir, "runs"), | |
dataDir, | |
os.path.join(dataDir, "fastq"), | |
os.path.join(dataDir, "fastqc"), | |
os.path.join(dataDir, "raw"), | |
os.path.join(dataDir, "mapped"), | |
os.path.join(dataDir, "coverage"), | |
os.path.join(dataDir, "peaks"), | |
os.path.join(dataDir, "motifs"), | |
resultsDir, | |
os.path.join(resultsDir, "plots"), | |
os.path.join(resultsDir, "plots", "cdt"), | |
os.path.join(resultsDir, "plots", "pickles"), | |
htmlDir, | |
os.path.join(args.html_root, args.project_name), | |
os.path.join(args.html_root, args.project_name, "bigWig") | |
] | |
for d in dirs: | |
if not os.path.exists(d): | |
os.makedirs(d) | |
# chmod of paths to public_html folder | |
html = [ | |
htmlDir, | |
os.path.join(args.html_root, args.project_name), | |
os.path.join(args.html_root, args.project_name, "bigWig") | |
] | |
for d in html: | |
try: | |
os.chmod(d, 0755) | |
except OSError: | |
logger.error("cannot change folder's mode: %s" % d) | |
continue | |
htmlDir = os.path.join(args.html_root, args.project_name, "bigWig") | |
urlRoot = args.url_root + args.project_name + "/bigWig/" | |
return (htmlDir, projectDir, dataDir, resultsDir, urlRoot) | |
def getReplicates(samples): | |
""" | |
Returns new sample annotation sheet with provided samples, plus biological replicates | |
(merged technical replicates) and merged biological replicates. | |
samples - a pandas.DataFrame with sample info. | |
""" | |
# ignore some fields in the annotation sheet | |
varsName = ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "biologicalReplicate", "technicalReplicate", "genome"] | |
# get sample names | |
samples["sampleName"] = ["_".join([str(j) for j in samples.ix[i][varsName]]) for i in samples.index] | |
samplesMerged = samples.copy() | |
# get merged technical replicates -> biological replicates | |
variables = ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "biologicalReplicate", "genome"] | |
for key, values in samples.groupby(variables).groups.items(): | |
rep = samples.ix[values][varsName].reset_index(drop=True).ix[0] | |
if len(values) > 1: | |
rep["experimentName"] = np.nan | |
rep["technicalReplicate"] = 0 | |
rep["unmappedBam"] = samples.ix[values]["unmappedBam"].tolist() | |
rep["sampleName"] = "_".join([str(i) for i in rep[varsName]]) | |
# append biological replicate to sample annotation | |
samplesMerged = samplesMerged.append(rep, ignore_index=True) | |
# get merged biological replicates -> merged biological replicates | |
variables = ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "genome"] | |
for key, values in samples.groupby(variables).groups.items(): | |
rep = samples.ix[values][varsName].reset_index(drop=True).ix[0] | |
if len(values) > 1: | |
# check samples in group are from different biological replicates | |
if len(samples.ix[samples.groupby(variables).groups[key]]['biologicalReplicate'].unique()) > 1: | |
rep["experimentName"] = np.nan | |
rep["biologicalReplicate"] = 0 | |
rep["technicalReplicate"] = 0 | |
rep["unmappedBam"] = samples.ix[values]["unmappedBam"].tolist() | |
rep["sampleName"] = "_".join([str(i) for i in rep[varsName]]) | |
# append biological replicate to sample annotation | |
samplesMerged = samplesMerged.append(rep, ignore_index=True) | |
# replace sample name with list of sample names for only one sample (original case) | |
for i in range(len(samplesMerged)): | |
if type(samplesMerged["unmappedBam"][i]) is not list: | |
samplesMerged["unmappedBam"][i] = [samplesMerged["unmappedBam"][i]] | |
return samplesMerged.sort(["sampleName"]) | |
def preprocess(args, logger): | |
""" | |
This takes unmapped Bam files and makes trimmed, aligned, indexed (and shifted if necessary) | |
Bam files along with a UCSC browser track. | |
""" | |
logger.info("Starting sample preprocessing.") | |
logger.debug("Checking project directories exist and creating if not.") | |
htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger) | |
# Paths to static files on the cluster | |
genomeFolder = "/fhgfs/prod/ngs_resources/genomes/" | |
genomeIndexes = { | |
"hg19": "/home/arendeiro/reference/Homo_sapiens/forBowtie2/hg19", | |
"mm10": os.path.join(genomeFolder, "mm10/forBowtie2/mm10/index_woRandom/mm10"), | |
"dr7": os.path.join(genomeFolder, "dr7/forBowtie2/dr7") | |
} | |
genomeSizes = { | |
"hg19": "/fhgfs/groups/lab_bock/arendeiro/share/hg19.chrom.sizes", | |
"mm10": "/fhgfs/groups/lab_bock/arendeiro/share/mm10.chrom.sizes", | |
"dr7": "/fhgfs/groups/lab_bock/arendeiro/share/danRer7.chrom.sizes" | |
} | |
genomeWindows = { | |
"hg19": "/fhgfs/groups/lab_bock/arendeiro/share/hg19.genomeWindows_1kb.bed", | |
"mm10": "/fhgfs/groups/lab_bock/arendeiro/share/mm10.genomeWindows_1kb.bed", | |
"dr7": "/fhgfs/groups/lab_bock/arendeiro/share/danRer7.genomeWindows_1kb.bed" | |
} | |
adapterFasta = "/fhgfs/groups/lab_bock/shared/cm.fa" | |
# Other static info | |
tagment = [ | |
"DNASE", "DNASESEQ", "DNASE-SEQ", "DHS", "DHS-SEQ", "DHSSEQ", | |
"ATAC", "ATAC-SEQ", "ATACSEQ", | |
"CM" | |
] | |
# from the ggplot2 color blind pallete | |
# #999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7" | |
# shortly: green,red=active, orange=transcription, blue=repression, yellow=potential | |
colours = { | |
"IGG": "153,153,153", "INPUT": "153,153,153", # grey | |
"H3K36ME1": "230,159,0", "H3K36ME2": "230,159,0", "H3K36ME3": "230,159,0", # orange | |
"H3K4ME3": "0,158,115", # bluish green | |
"H3K4ME1": "120,114,33", "H3K14ac": "120,114,33", # yellow | |
"H3K27ME1": "0,114,178", "H3K27ME2": "0,114,178", "H3K27ME3": "0,114,178", # blue | |
"H3K9ME1": "86,180,233", "H3K9ME2": "86,180,233", "H3K9ME3": "86,180,233", # sky blue | |
"H3AC": "213,94,0", "H3K9AC": "213,94,0", "H3K27AC": "213,94,0", "H3K56AC": "213,94,0", "H3K56AC": "213,94,0", # vermillion | |
"H3K79ME1": "204,121,167", "H3K79ME2": "204,121,167", "H3K79ME3": "204,121,167", # reddish purple | |
"ATAC": "0,158,115", | |
"DNASE": "0,158,115", | |
} | |
colour_gradient = [ # 10 colour gradient from red to blue | |
"155,3,5", "140,2,18", "125,2,31", "110,2,44", "96,2,57", | |
"81,2,70", "66,2,83", "52,2,96", "37,2,109", "22,2,122" | |
] | |
# Parse sample information | |
args.csv = os.path.abspath(args.csv) | |
# check if exists and is a file | |
if not os.path.isfile(args.csv): | |
logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv) | |
sys.exit(1) | |
# read in | |
samples = pd.read_csv(args.csv) | |
# TODO: | |
# Perform checks on the variables given | |
# (e.g. genome in genomeIndexes) | |
# start pipeline | |
projectName = string.join([args.project_name, time.strftime("%Y%m%d-%H%M%S")], sep="_") | |
# Get biological replicates and merged biological replicates | |
logger.debug("Checking which samples should be merged.") | |
samplesMerged = getReplicates(samples.copy()) | |
# ^^ this is the new annotation sheet as well | |
samplesMerged['readType'] = None | |
samplesMerged['readLength'] = None | |
# add field for manual sample pairing | |
samplesMerged["controlSampleName"] = None | |
# Preprocess samples | |
for sample in range(len(samplesMerged)): | |
# Get sample name | |
sampleName = samplesMerged["sampleName"][sample] | |
# get jobname | |
jobName = projectName + "_" + sampleName | |
# check if sample is paired end | |
PE = False | |
# check if sample is tagmented or not: | |
tagmented = True | |
# get intermediate names for files | |
if len(samplesMerged["unmappedBam"][sample]) == 1: | |
unmappedBam = samplesMerged["unmappedBam"][sample][0] | |
else: | |
unmappedBam = os.path.join(dataDir, "raw", sampleName + ".bam") | |
if not tagmented: | |
bam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2") | |
else: | |
bam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.shifted") | |
# get colour for tracks | |
if str(samplesMerged["ip"][sample]).upper() in colours.keys(): | |
colour = colours[samplesMerged["ip"][sample].upper()] | |
else: | |
if samplesMerged["technique"][sample] in ["ATAC", "ATACSEQ", "ATAC-SEQ"]: | |
colour = colours["ATAC"] | |
elif samplesMerged["technique"][sample] in ["DNASE", "DNASESEQ", "DNASE-SEQ"]: | |
colour = colours["DNASE"] | |
else: | |
colour = random.sample(colour_gradient, 1)[0] # pick one randomly | |
# keep track of temporary files | |
tempFiles = list() | |
# assemble commands | |
# get job header | |
jobCode = slurmHeader( | |
jobName=jobName, | |
output=os.path.join(projectDir, "runs", jobName + ".slurm.log"), | |
queue=args.queue, | |
time=args.time, | |
cpusPerTask=args.cpus, | |
memPerCpu=args.mem, | |
userMail=args.user_mail | |
) | |
if args.stage in ["all", "trim"]: | |
# TODO: | |
# Change absolute path to something usable by everyone or to an option. | |
if args.trimmer == "trimmomatic": | |
jobCode += trimmomatic( | |
inputFastq1=samplesMerged["unmappedBam"][sample], | |
inputFastq2=os.path.join(dataDir, "fastq", sampleName + ".2.fastq") if PE else None, | |
outputFastq1=os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq") if not PE else os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq"), | |
outputFastq1unpaired=os.path.join(dataDir, "fastq", sampleName + ".1_unpaired.trimmed.fastq") if PE else None, | |
outputFastq2=os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq") if PE else None, | |
outputFastq2unpaired=os.path.join(dataDir, "fastq", sampleName + ".2_unpaired.trimmed.fastq") if PE else None, | |
cpus=args.cpus, | |
adapters=adapterFasta, | |
log=os.path.join(resultsDir, sampleName + ".trimlog.txt") | |
) | |
if not PE: | |
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq")) | |
else: | |
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq")) | |
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq")) | |
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".1_unpaired.trimmed.fastq")) | |
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".2_unpaired.trimmed.fastq")) | |
if args.trimmer == "skewer": | |
jobCode += skewer( | |
inputFastq1=samplesMerged["unmappedBam"][sample] if not PE else os.path.join(dataDir, "fastq", sampleName + ".1.fastq"), | |
inputFastq2=os.path.join(dataDir, "fastq", sampleName + ".2.fastq") if PE else None, | |
outputPrefix=os.path.join(dataDir, "fastq", sampleName), | |
cpus=args.cpus, | |
adapters=adapterFasta | |
) | |
# move files to have common name | |
if not PE: | |
jobCode += moveFile( | |
os.path.join(dataDir, "fastq", sampleName + "-trimmed.fastq"), | |
os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq") | |
) | |
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq")) | |
else: | |
jobCode += moveFile( | |
os.path.join(dataDir, "fastq", sampleName + "-trimmed-pair1.fastq"), | |
os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq") | |
) | |
jobCode += moveFile( | |
os.path.join(dataDir, "fastq", sampleName + "-trimmed-pair2.fastq"), | |
os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq") | |
) | |
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq")) | |
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq")) | |
# move log to results dir | |
jobCode += moveFile( | |
os.path.join(dataDir, "fastq", sampleName + "-trimmed.log"), | |
os.path.join(resultsDir, sampleName + ".trimlog.txt") | |
) | |
if args.stage in ["all", "mapping"]: | |
if samplesMerged["genome"][sample] not in genomeIndexes: | |
logger.error("Sample %s has unsuported genome index: %s" % (sampleName, samplesMerged["genome"][sample])) | |
sys.exit(1) | |
jobCode += bowtie2Map( | |
inputFastq1=os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq") if PE else os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq"), | |
inputFastq2=os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq") if PE else None, | |
outputBam=os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.bam"), | |
log=os.path.join(resultsDir, sampleName + ".alnRates.txt"), | |
metrics=os.path.join(resultsDir, sampleName + ".alnMetrics.txt"), | |
genomeIndex=genomeIndexes[samplesMerged["genome"][sample]], | |
maxInsert=args.maxinsert, | |
cpus=args.cpus | |
) | |
tempFiles.append(os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.bam")) | |
if args.stage in ["all", "shiftreads"]: | |
if tagmented: | |
jobCode += shiftReads( | |
inputBam=os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.bam"), | |
genome=samplesMerged["genome"][sample], | |
outputBam=bam + ".bam" | |
) | |
if args.stage in ["all", "markduplicates"]: | |
jobCode += markDuplicates( | |
inputBam=bam + ".bam", | |
outputBam=bam + ".dups.bam", | |
metricsFile=os.path.join(resultsDir, sampleName + ".duplicates.txt") # | |
# tempDir= | |
) | |
if args.stage in ["all", "removeduplicates"]: | |
jobCode += removeDuplicates( | |
inputBam=bam + ".dups.bam", | |
outputBam=bam + ".nodups.bam", | |
cpus=args.cpus | |
) | |
if args.stage in ["all", "indexbam"]: | |
jobCode += indexBam( | |
inputBam=bam + ".dups.bam" | |
) | |
jobCode += indexBam( | |
inputBam=bam + ".nodups.bam" | |
) | |
if args.stage in ["all", "maketracks"]: | |
# right now tracks are only made for bams with duplicates | |
jobCode += bamToUCSC( | |
inputBam=bam + ".dups.bam", | |
outputBigWig=os.path.join(htmlDir, sampleName + ".bigWig"), | |
genomeSizes=genomeSizes[samplesMerged["genome"][sample]], | |
genome=samplesMerged["genome"][sample], | |
tagmented=False | |
) | |
jobCode += addTrackToHub( | |
sampleName=sampleName, | |
trackURL=urlRoot + sampleName + ".bigWig", | |
trackHub=os.path.join(htmlDir, "trackHub_{0}.txt".format(samplesMerged["genome"][sample])), | |
colour=colour | |
) | |
if tagmented: | |
jobCode += bamToUCSC( | |
inputBam=bam + ".dups.bam", | |
outputBigWig=os.path.join(htmlDir, sampleName + ".5prime.bigWig"), | |
genomeSizes=genomeSizes[samplesMerged["genome"][sample]], | |
genome=samplesMerged["genome"][sample], | |
tagmented=True | |
) | |
jobCode += addTrackToHub( | |
sampleName=sampleName, | |
trackURL=urlRoot + sampleName + ".5prime.bigWig", | |
trackHub=os.path.join(htmlDir, "trackHub_{0}.txt".format(samplesMerged["genome"][sample])), | |
fivePrime="5prime", | |
colour=colour | |
) | |
linkToTrackHub( | |
trackHubURL="{0}/{1}/bigWig/trackHub_{2}.txt".format(args.url_root, args.project_name, samplesMerged["genome"][sample]), | |
fileName=os.path.join(projectDir, "ucsc_tracks_{0}.html".format(samplesMerged["genome"][sample])), | |
genome=samplesMerged["genome"][sample] | |
) | |
if args.stage in ["all", "coverage"]: | |
jobCode += genomeWideCoverage( | |
inputBam=bam + ".nodups.bam", | |
genomeWindows="/fhgfs/groups/lab_bock/arendeiro/share/hg19.genomeWindows_1kb.filteredMappability50mer.filteredBlacklisted.bed", | |
output=os.path.join(dataDir, "coverage", sampleName + ".cov") | |
) | |
# Remove intermediary files | |
if args.stage == "all" and not args.keep_tmp: | |
logger.debug("Removing intermediary files") | |
for fileName in tempFiles: | |
jobCode += removeFile(fileName) | |
# Submit job to slurm | |
# Get concatenated string with code from all modules | |
jobCode += slurmFooter() | |
# Output file name | |
jobFile = os.path.join(projectDir, "runs", jobName + ".sh") | |
with open(jobFile, 'w') as handle: | |
handle.write(textwrap.dedent(jobCode)) | |
# Submit to slurm | |
if not args.dry_run: | |
logger.info("Submitting job to slurm: %s" % jobName) | |
status = slurmSubmitJob(jobFile) | |
if status != 0: | |
logger.error("Slurm job '%s' not successfull" % jobFile) | |
sys.exit(1) | |
logger.debug("Project '%s'submission finished successfully." % args.project_name) | |
# Write location of bam file in merged samples annotation sheet | |
samplesMerged['unmappedBam'][sample] = bam + ".dups.bam" | |
# write original annotation sheet to project folder | |
samples.to_csv(os.path.join(projectDir, args.project_name + ".annotation_sheet.csv"), index=False) | |
# write annotation sheet with biological replicates to project folder | |
samplesMerged.to_csv( | |
os.path.join(projectDir, args.project_name + ".replicates.annotation_sheet.csv"), | |
index=False | |
) | |
logger.debug("Finished preprocessing") | |
return (logger, | |
os.path.join(os.getcwd(), args.project_name + ".log"), | |
os.path.join(projectDir, "runs", args.project_name + ".log")) | |
def readStats(args, logger): | |
""" | |
Given an annotation sheet with replicates, gets number of reads mapped, duplicates, etc... | |
""" | |
logger.info("Starting sample read stats.") | |
logger.debug("Checking project directories exist and creating if not.") | |
htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger) | |
# Parse sample information | |
args.csv = os.path.abspath(args.csv) | |
# check if exists and is a file | |
if not os.path.isfile(args.csv): | |
logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv) | |
sys.exit(1) | |
# read in | |
samples = pd.read_csv(args.csv) | |
variables = ["cellLine", "numberCells", "technique", "ip", "patient", | |
"treatment", "biologicalReplicate", "technicalReplicate", "genome"] | |
cols = ["readCount", "unpaired", "unaligned", "unique", "multiple", "alignmentRate"] | |
for col in cols: | |
samples[col] = None | |
cols = ["unpairedReadsExamined", "readPairsExamined", "unmappedReads", "unpairedReadDuplicates", | |
"readPairDuplicates", "readPairOpticalDuplicates", "percentDuplication", "estimatedLibrarySize"] | |
samples["totalReads"] = None | |
for col in cols: | |
samples[col] = None | |
if "peakFile" in samples.columns: | |
samples["NSC"] = None | |
samples["RSC"] = None | |
samples["peakNumber"] = None | |
samples["FRiP"] = None | |
for sample in xrange(len(samples)): | |
# Sample name | |
# if sampleName is not provided, use a concatenation of several variable (excluding longest) | |
if str(samples["sampleName"][sample]) != "nan": | |
sampleName = samples["sampleName"][sample] | |
else: | |
sampleName = string.join([str(samples[var][sample]) for var in variables], sep="_") | |
logger.debug("No sample name provided, using concatenation of variables supplied") | |
# Get alignment stats | |
try: | |
# get stats into a series | |
alnStats = parseBowtieStats(os.path.join(resultsDir, sampleName + ".alnRates.txt")) | |
# add to dataFrame | |
samples.loc[sample, alnStats.index.tolist()] = alnStats | |
except: | |
logger.warn("Record with alignment rates is empty or not found for sample %s" % sampleName) | |
pass | |
# Get duplicate stats | |
try: | |
dups = pd.read_csv( | |
os.path.join(resultsDir, sampleName + ".duplicates.txt"), | |
sep="\t", | |
comment="#", | |
header=0 | |
) | |
# drop if empty | |
dups.dropna(thresh=len(dups.columns) - 1, inplace=True) | |
# Add values to sample sheet | |
for col in range(len(cols)): | |
samples.loc[sample, cols[col]] = dups.drop(["LIBRARY"], axis=1).ix[0][col] | |
except: | |
logger.warn("Record with duplicates is empty or not found for sample %s" % sampleName) | |
pass | |
# Get NSC and RSC | |
qcFile = os.path.join(resultsDir, "sample_QC.tsv") | |
qc = parseQC(sampleName, qcFile) | |
samples.loc[sample, "NSC"] = qc["NSC"] | |
samples.loc[sample, "RSC"] = qc["RSC"] | |
samples.loc[sample, "qualityTag"] = qc["qualityTag"] | |
# Count peak number (if peaks in sheet) | |
if "peakFile" in samples.columns: | |
# and if sample has peaks | |
if str(samples["peakFile"][sample]) != "nan": | |
proc = subprocess.Popen(["wc", "-l", samples["peakFile"][sample]], stdout=subprocess.PIPE) | |
out, err = proc.communicate() | |
samples.loc[sample, "peakNumber"] = re.sub("\D.*", "", out) | |
# Get FRiP from file (if exists) and add to sheet | |
if "peakFile" in samples.columns: | |
# if sample has peaks | |
if str(samples["peakFile"][sample]) != "nan": | |
try: | |
with open(os.path.join(resultsDir, sampleName + "_FRiP.txt"), "r") as handle: | |
content = handle.readlines() | |
readsInPeaks = int(re.sub("\D", "", content[0])) | |
mappedReads = samples["readCount"][sample] - samples["unaligned"][sample] | |
samples.loc[sample, "FRiP"] = readsInPeaks / mappedReads | |
except: | |
logger.warn("Record with FRiP value is empty or not found for sample %s" % sampleName) | |
pass | |
# convert duplicate rate to percentage | |
samples.loc[:, 'percentDuplication'] = samples['percentDuplication'] * 100 | |
# write annotation sheet with biological replicates | |
samples.to_csv(os.path.join(projectDir, args.project_name + ".read_stats.csv"), index=False) | |
logger.debug("Finished getting read statistics.") | |
return (logger, | |
os.path.join(os.getcwd(), args.project_name + ".log"), | |
os.path.join(projectDir, "runs", args.project_name + ".log")) | |
def analyse(args, logger): | |
logger.info("Starting sample analysis.") | |
logger.debug("Checking project directories exist and creating if not.") | |
htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger) | |
# Paths to static files on the cluster | |
# Other static info | |
tagment = [ | |
"DNASE", "DNASESEQ", "DNASE-SEQ", "DHS", "DHS-SEQ", "DHSSEQ", | |
"ATAC", "ATAC-SEQ", "ATACSEQ", | |
"CM" | |
] | |
histones = ["H2A", "H2B", "H3", "H4"] | |
broadFactors = [ | |
"H3K27ME1", "H3K27ME2", "H3K27ME3", | |
"H3K36ME1", "H3K36ME2", "H3K36ME3", | |
"H3K9ME1", "H3K9ME2", "H3K9ME3", | |
"H3K72ME1", "H3K72ME2", "H3K72ME3" | |
] | |
tssFiles = { | |
"hg19": "/fhgfs/groups/lab_bock/arendeiro/share/GRCh37_hg19_refSeq.tss.bed", | |
"mm10": "/fhgfs/groups/lab_bock/arendeiro/share/GRCm38_mm10_refSeq.tss.bed", | |
"dr7": "/fhgfs/groups/lab_bock/arendeiro/share/GRCh37_hg19_refSeq.tss.bed" | |
} | |
# Parse sample information | |
args.csv = os.path.abspath(args.csv) | |
# check if exists and is a file | |
if not os.path.isfile(args.csv): | |
logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv) | |
sys.exit(1) | |
# read in | |
samples = pd.read_csv(args.csv) | |
samples["controlSampleunmappedBam"] = None | |
samples["peakFile"] = None | |
# TODO: | |
# Perform checks on the variables given | |
# (e.g. columns existing, bams existing) | |
# start pipeline | |
projectName = string.join([args.project_name, time.strftime("%Y%m%d-%H%M%S")], sep="_") | |
# Preprocess samples | |
variables = ["cellLine", "numberCells", "technique", "ip", "patient", | |
"treatment", "biologicalReplicate", "technicalReplicate", "genome"] | |
# track jobs to submit | |
jobs = dict() | |
for sample in xrange(len(samples)): | |
# Sample name | |
# if sampleName is not provided, use a concatenation of several variable (excluding longest) | |
if str(samples["sampleName"][sample]) != "nan": | |
sampleName = samples["sampleName"][sample] | |
else: | |
sampleName = string.join([str(samples[var][sample]) for var in variables], sep="_") | |
logger.debug("No sample name provided, using concatenation of variables supplied") | |
# Pair with control | |
control = False | |
# get file path of sample which this sample's controlName is pointing to | |
ctrlField = samples[samples['sampleName'].isin([samples['controlSampleName'][sample]])]['unmappedBam'] | |
# ^^ doesn't mean it has something (i.e. is empty if no control sample is indicated) | |
# if there is only one record, use that as control | |
if len(ctrlField) == 1 and type(ctrlField.values[0]) == str: | |
control = True | |
controlName = samples.ix[sample]["controlSampleName"] | |
controlBam = os.path.abspath(ctrlField.values[0]) | |
samples["controlSampleunmappedBam"][sample] = controlBam | |
# skip samples without matched control | |
if not control: | |
continue | |
# check if sample is tagmented or not: | |
tagmented = True if samples["technique"][sample] in tagment else False | |
if not tagmented: | |
nodupsbam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.nodups.bam") | |
else: | |
nodupsbam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.shifted.nodups.bam") | |
# Is it a histone? | |
histone = True if any([i in samples["ip"][sample] for i in histones]) else False | |
# Is it a broad factor? | |
broad = False if samples["ip"][sample].upper() not in broadFactors else True | |
# peakFile | |
peakFile = os.path.join( | |
dataDir, "peaks", sampleName, | |
sampleName + "_peaks" + (".narrowPeak" if not broad else ".broadPeak") | |
) | |
samples["peakFile"][sample] = peakFile | |
jobName = projectName + "_" + sampleName | |
tempFiles = list() | |
# assemble commands | |
jobCode = slurmHeader( | |
jobName=jobName, | |
output=os.path.join(projectDir, "runs", jobName + ".slurm.log"), | |
queue=args.queue, | |
time=args.time, | |
cpusPerTask=args.cpus, | |
memPerCpu=args.mem, | |
userMail=args.user_mail | |
) | |
if args.stage in ["all", "qc"]: | |
jobCode += peakTools( | |
inputBam=nodupsbam, | |
output=os.path.join(resultsDir, "sample_QC.tsv"), | |
plot=os.path.join(resultsDir, sampleName + "_QC.pdf"), | |
cpus=args.cpus | |
) | |
if args.stage in ["all", "callpeaks"] and control: | |
if args.peak_caller == "macs2": | |
# make dir for output | |
if not os.path.exists(os.path.join(dataDir, "peaks", sampleName)): | |
os.makedirs(os.path.join(dataDir, "peaks", sampleName)) | |
# For point-source factors use default settings | |
# For broad factors use broad settings | |
jobCode += macs2CallPeaks( | |
treatmentBam=os.path.abspath(samples['unmappedBam'][sample]), | |
controlBam=controlBam, | |
outputDir=os.path.join(dataDir, "peaks", sampleName), | |
sampleName=sampleName, | |
genome=samples['genome'][sample], | |
broad=True if broad else False | |
) | |
elif args.peak_caller == "spp": | |
# For point-source factors use default settings | |
# For broad factors use broad settings | |
jobCode += sppCallPeaks( | |
treatmentBam=os.path.abspath(samples['unmappedBam'][sample]), | |
controlBam=controlBam, | |
treatmentName=sampleName, | |
controlName=controlName, | |
outputDir=os.path.join(dataDir, "peaks", sampleName), | |
broad=True if broad else False, | |
cpus=args.cpus | |
) | |
if args.stage in ["all", "findmotifs"] and control: | |
# make dir for motifs | |
if not os.path.exists(os.path.join(dataDir, "motifs", sampleName)): | |
os.makedirs(os.path.join(dataDir, "motifs", sampleName)) | |
if not histone: | |
# For TFs, find the "self" motif | |
jobCode += homerFindMotifs( | |
peakFile=peakFile, | |
genome=samples["genome"][sample], | |
outputDir=os.path.join(dataDir, "motifs", sampleName), | |
size="50", | |
length="8,10,12,14,16", | |
n_motifs=8 | |
) | |
# For TFs, find co-binding motifs (broader region) | |
jobCode += homerFindMotifs( | |
peakFile=peakFile, | |
genome=samples["genome"][sample], | |
outputDir=os.path.join(dataDir, "motifs", sampleName + "_cobinders"), | |
size="200", | |
length="8,10,12,14,16", | |
n_motifs=12 | |
) | |
else: | |
# For histones, use a broader region to find motifs | |
jobCode += homerFindMotifs( | |
peakFile=peakFile, | |
genome=samples["genome"][sample], | |
outputDir=os.path.join(dataDir, "motifs", sampleName), | |
size="1000", | |
length="8,10,12,14,16", | |
n_motifs=20 | |
) | |
if args.stage in ["all", "centerpeaks"] and control: | |
# TODO: | |
# right now this assumes peaks were called with MACS2 | |
# figure a way of magetting the peak files withough using the peak_caller option | |
# for that would imply taht option would be required when selecting this stage | |
jobCode += centerPeaksOnMotifs( | |
peakFile=peakFile, | |
genome=samples["genome"][sample], | |
windowWidth=args.peak_window_width, | |
motifFile=os.path.join(dataDir, "motifs", sampleName, "homerResults", "motif1.motif"), | |
outputBed=os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifCentered.bed") | |
) | |
if args.stage in ["all", "annotatepeaks"] and control: | |
# TODO: | |
# right now this assumes peaks were called with MACS2 | |
# figure a way of getting the peak files withough using the peak_caller option | |
# for that would imply taht option would be required when selecting this stage | |
jobCode += AnnotatePeaks( | |
peakFile=peakFile, | |
genome=samples["genome"][sample], | |
motifFile=os.path.join(dataDir, "motifs", sampleName, "homerResults", "motif1.motif"), | |
outputBed=os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifAnnotated.bed") | |
) | |
if args.stage in ["all", "peakanalysis"] and control: | |
jobCode += peakAnalysis( | |
inputBam=os.path.abspath(samples.ix[sample]['unmappedBam']), | |
peakFile=os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifCentered.bed"), | |
plotsDir=os.path.join(resultsDir, 'plots'), | |
windowWidth=2000, | |
fragmentsize=1 if tagmented else 50, # change this to actual read length | |
genome=samples.ix[sample]['genome'], | |
n_clusters=5, | |
strand_specific=True, | |
duplicates=True | |
) | |
if args.stage in ["all", "tssanalysis"] and control: | |
jobCode += tssAnalysis( | |
inputBam=os.path.abspath(samples.ix[sample]['unmappedBam']), | |
tssFile=tssFiles[samples.ix[sample]['genome']], | |
plotsDir=os.path.join(resultsDir, 'plots'), | |
windowWidth=2000, | |
fragmentsize=1 if tagmented else 50, # change this to actual read length | |
genome=samples.ix[sample]['genome'], | |
n_clusters=5, | |
strand_specific=True, | |
duplicates=True | |
) | |
if args.stage in ["all", "frip"]: | |
jobCode += calculateFRiP( | |
inputBam=os.path.abspath(samples['unmappedBam'][sample]), | |
inputBed=peakFile, | |
output=os.path.join(resultsDir, sampleName + "_FRiP.txt"), | |
) | |
# if args.stage in ["all", "footprints"] and control and tagmented: | |
# jobCode += footprintAnalysis( | |
# os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifCentered.bed"), | |
# os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifAnnotated.bed") | |
# ) | |
# finish job | |
jobCode += slurmFooter() | |
jobs[jobName] = jobCode | |
# Submit jobs to slurm | |
for jobName, jobCode in jobs.items(): | |
# Output file name | |
jobFile = os.path.join(projectDir, "runs", jobName + ".sh") | |
with open(jobFile, 'w') as handle: | |
handle.write(textwrap.dedent(jobCode)) | |
# Submit to slurm | |
if not args.dry_run: | |
logger.info("Submitting jobs to slurm") | |
status = slurmSubmitJob(jobFile) | |
if status != 0: | |
logger.error("Slurm job '%s' not successfull" % jobFile) | |
sys.exit(1) | |
logger.debug("Project '%s'submission finished successfully." % args.project_name) | |
# write original annotation sheet to project folder | |
samples.to_csv( | |
os.path.join(projectDir, args.project_name + ".replicates.annotation_sheet.csv"), | |
index=False | |
) | |
logger.debug("Finished comparison") | |
return (logger, | |
os.path.join(os.getcwd(), args.project_name + ".log"), | |
os.path.join(projectDir, "runs", args.project_name + ".log")) | |
def compare(args, logger): | |
logger.info("Starting sample comparison.") | |
logger.debug("Checking project directories exist and creating if not.") | |
htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger) | |
# Paths to static files on the cluster | |
# Other static info | |
# Parse sample information | |
args.csv = os.path.abspath(args.csv) | |
# check if exists and is a file | |
if not os.path.isfile(args.csv): | |
logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv) | |
sys.exit(1) | |
# read in | |
samples = pd.read_csv(args.csv) | |
# TODO: | |
# Perform checks on the variables given | |
# (e.g. columns existing, bams/peaks existing) | |
# start pipeline | |
projectName = string.join([args.project_name, time.strftime("%Y%m%d-%H%M%S")], sep="_") | |
# track jobs to submit | |
jobs = dict() | |
# diffBind | |
if args.stage in ["all", "diffbind"]: | |
# Submit separate jobs for each genome | |
genome_groups = samples.groupby(["genome"]).groups | |
for genome in genome_groups.keys(): | |
# Separate comparison per IPed factor | |
df = samples.ix[genome_groups[genome]] | |
IP_groups = df.groupby(["ip"]).groups | |
for IP in IP_groups.keys(): | |
if IP.upper() in ["INPUT", "IGG"]: # skip groups with control | |
continue | |
jobName = projectName + "_" + "diffBind_{0}_{1}".format(genome, IP) | |
diffBindSheetFile = os.path.join(projectDir, "runs", jobName + ".csv") | |
print(diffBindSheetFile) | |
# make diffBind csv file, save it | |
empty = makeDiffBindSheet( | |
samples, | |
samples.ix[IP_groups[IP]], | |
os.path.join(dataDir, "peaks"), | |
diffBindSheetFile | |
) | |
# ^^ returns Bool for empty diffBind sheet | |
if not empty: | |
# create job | |
jobCode = slurmHeader( | |
jobName=jobName, | |
output=os.path.join(projectDir, "runs", jobName + ".slurm.log"), | |
queue=args.queue, | |
time=args.time, | |
cpusPerTask=args.cpus, | |
memPerCpu=args.mem, | |
userMail=args.user_mail | |
) | |
jobCode += diffBind( | |
inputCSV=diffBindSheetFile, | |
jobName=jobName, | |
plotsDir=os.path.join(resultsDir, 'plots') | |
) | |
jobCode += slurmFooter() | |
jobs[jobName] = jobCode | |
# Submit job for all samples together (per genome) | |
if args.stage in ["all", "correlations"]: | |
# Submit separate jobs for each genome | |
genome_groups = samples.groupby(["genome"]).groups | |
for genome in genome_groups.keys(): | |
jobName = projectName + "_" + genome + "_sample_correlations" | |
# Separate comparison per IPed factor | |
df = samples.ix[genome_groups[genome]] | |
jobCode = slurmHeader( | |
jobName=jobName, | |
output=os.path.join(projectDir, "runs", jobName + ".slurm.log"), | |
queue=args.queue, | |
time=args.time, | |
cpusPerTask=args.cpus, | |
memPerCpu=args.mem, | |
userMail=args.user_mail | |
) | |
jobCode += plotCorrelations( | |
inputCoverage=[os.path.join(dataDir, "coverage", sampleName + ".cov") for sampleName in list(df['sampleName'])], | |
plotsDir=os.path.join(resultsDir, 'plots') | |
) | |
jobCode += slurmFooter() | |
jobs[jobName] = jobCode | |
# Submit jobs to slurm | |
for jobName, jobCode in jobs.items(): | |
# Output file name | |
jobFile = os.path.join(projectDir, "runs", jobName + ".sh") | |
with open(jobFile, 'w') as handle: | |
handle.write(textwrap.dedent(jobCode)) | |
# Submit to slurm | |
if not args.dry_run: | |
logger.info("Submitting jobs to slurm") | |
status = slurmSubmitJob(jobFile) | |
if status != 0: | |
logger.error("Slurm job '%s' not successfull" % jobFile) | |
sys.exit(1) | |
logger.debug("Project '%s'submission finished successfully." % args.project_name) | |
logger.debug("Finished comparison") | |
return (logger, | |
os.path.join(os.getcwd(), args.project_name + ".log"), | |
os.path.join(projectDir, "runs", args.project_name + ".log")) | |
def getReadType(bamFile, n=10): | |
""" | |
Gets the read type (single, paired) and length of bam file. | |
Returns tuple of (readType=string, readLength=int). | |
""" | |
from collections import Counter | |
p = subprocess.Popen(['samtools', 'view', bamFile], stdout=subprocess.PIPE) | |
# Count paired alignments | |
paired = 0 | |
readLength = Counter() | |
while n > 0: | |
line = p.stdout.next().split("\t") | |
flag = int(line[1]) | |
readLength[len(line[9])] += 1 | |
if 1 & flag: # check decimal flag contains 1 (paired) | |
paired += 1 | |
n -= 1 | |
p.kill() | |
# Get most abundant read readLength | |
readLength = sorted(readLength)[-1] | |
# If at least half is paired, return True | |
if paired > (n / 2): | |
return ("PE", readLength) | |
else: | |
return ("SE", readLength) | |
def parseBowtieStats(statsFile): | |
""" | |
Parses Bowtie2 stats file, returns series with values. | |
""" | |
stats = pd.Series(index=["readCount", "unpaired", "unaligned", "unique", "multiple", "alignmentRate"]) | |
with open(statsFile) as handle: | |
content = handle.readlines() # list of strings per line | |
# total reads | |
try: | |
line = [i for i in range(len(content)) if " reads; of these:" in content[i]][0] | |
stats["readCount"] = re.sub("\D.*", "", content[line]) | |
line = [i for i in range(len(content)) if "were unpaired; of these:" in content[i]][0] | |
stats["unpaired"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) | |
line = [i for i in range(len(content)) if "aligned 0 times" in content[i]][0] | |
stats["unaligned"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) | |
line = [i for i in range(len(content)) if "aligned exactly 1 time" in content[i]][0] | |
stats["unique"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) | |
line = [i for i in range(len(content)) if "aligned >1 times" in content[i]][0] | |
stats["multiple"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) | |
line = [i for i in range(len(content)) if "overall alignment rate" in content[i]][0] | |
stats["alignmentRate"] = re.sub("\D.*", "", content[line]) | |
except IndexError: | |
pass | |
return stats | |
def parseMACSModel(modelFile): | |
""" | |
Parses Bowtie2 stats file, returns series with values. | |
""" | |
out = dict() | |
with open(modelFile) as handle: | |
content = handle.readlines() # list of strings per line | |
try: | |
patt = re.compile("\((.+)\)") | |
line = [i for i in range(len(content)) if "p <- c(" in content[i]][0] | |
out["p"] = [float(i) for i in patt.search(content[line]).group(1).split(",")] | |
line = [i for i in range(len(content)) if "m <- c(" in content[i]][0] | |
out["m"] = [float(i) for i in patt.search(content[line]).group(1).split(",")] | |
line = [i for i in range(len(content)) if "ycorr <- c(" in content[i]][0] | |
out["ycorr"] = [float(i) for i in patt.search(content[line]).group(1).split(",")] | |
line = [i for i in range(len(content)) if "xcorr <- c(" in content[i]][0] | |
out["xcorr"] = [float(i) for i in patt.search(content[line]).group(1).split(",")] | |
line = [i for i in range(len(content)) if "altd <- c(" in content[i]][0] | |
out["altd"] = [float(i) for i in patt.search(content[line]).group(1).split(",")] | |
except IndexError: | |
pass | |
return out | |
def parseQC(sampleName, qcFile): | |
""" | |
Parses QC table produced by phantompeakqualtools (spp) and adds to sample annotation sheet | |
sample quality metrics for each sample. | |
""" | |
series = pd.Series(index=["NSC", "RSC", "qualityTag"]) | |
try: | |
with open(qcFile) as handle: | |
content = handle.readlines() # list of strings per line | |
except: | |
return series | |
try: | |
line = [i for i in range(len(content)) if str(sampleName) in content[i]][0] | |
attrs = content[line].strip().split("\t") | |
series["NSC"] = attrs[-3] | |
series["RSC"] = attrs[-2] | |
series["qualityTag"] = attrs[-1] | |
except IndexError: | |
pass | |
return series | |
def makeDiffBindSheet(samples, df, peaksDir, sheetFile): | |
""" | |
Prepares and saves a diffBind annotation sheet from a pandas.DataFrame annotation with standard columns. | |
""" | |
# Exclude samples without matched control | |
df = df[df["controlSampleName"].notnull()] | |
# Convert numerical fields to str | |
for col in ["numberCells", "technique", "treatment", "patient"]: | |
df[col] = df[col].apply(str) | |
# Merge some annotation fields | |
df.loc[:, "condition"] = df["numberCells"] + "_" + df["technique"] | |
df.loc[:, "treatment"] = df["treatment"] + "_" + df["patient"] | |
# Complete rest of annotation | |
df.loc[:, "ControlID"] = df['controlSampleName'] | |
df.loc[:, "bamControl"] = [samples[samples['sampleName'] == ctrl]["unmappedBam"].tolist()[0] for ctrl in df['controlSampleName']] | |
# TODO: | |
# Change peak path according to narrow/broad | |
df.loc[:, "Peaks"] = [os.path.join(peaksDir, sampleName, sampleName + "_peaks.narrowPeak") for sampleName in df["sampleName"]] | |
df["PeakCaller"] = "macs" | |
# Filter columns used | |
df = df[["sampleName", "cellLine", "ip", "condition", "treatment", | |
"biologicalReplicate", "unmappedBam", "ControlID", "bamControl", "Peaks", "PeakCaller"]] | |
# Rename columns | |
df.columns = ["SampleID", "Tissue", "Factor", "Condition", "Treatment", | |
"Replicate", "bamReads", "ControlID", "bamControl", "Peaks", "PeakCaller"] | |
# If resulting table is not empty, write to disk | |
if not df.empty: | |
# save as csv | |
df.to_csv(sheetFile, index=False) # check if format complies with DiffBind | |
# Return bool of empty | |
return df.empty | |
def slurmHeader(jobName, output, queue="shortq", ntasks=1, time="10:00:00", | |
cpusPerTask=16, memPerCpu=2000, nodes=1, userMail=""): | |
command = """ #!/bin/bash | |
#SBATCH --partition={0} | |
#SBATCH --ntasks={1} | |
#SBATCH --time={2} | |
#SBATCH --cpus-per-task={3} | |
#SBATCH --mem-per-cpu={4} | |
#SBATCH --nodes={5} | |
#SBATCH --job-name={6} | |
#SBATCH --output={7} | |
#SBATCH --mail-type=end | |
#SBATCH --mail-user={8} | |
# Start running the job | |
hostname | |
date | |
""".format(queue, ntasks, time, cpusPerTask, memPerCpu, | |
nodes, jobName, output, userMail) | |
return command | |
def slurmFooter(): | |
command = """ | |
# Job end | |
date | |
""" | |
return command | |
def slurmSubmitJob(jobFile): | |
command = "sbatch %s" % jobFile | |
return os.system(command) | |
def removeFile(fileName): | |
command = """ | |
# Removing file | |
rm {0} | |
""".format(fileName) | |
return command | |
def moveFile(old, new): | |
command = """ | |
# Moving file | |
mv {0} {1} | |
""".format(old, new) | |
return command | |
def makeDir(directory): | |
command = """ | |
# Removing file | |
mkdir -p {0} | |
""".format(directory) | |
return command | |
def mergeBams(inputBams, outputBam): | |
command = """ | |
# Merging bam files from replicates | |
echo "Merging bam files from replicates" | |
java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/MergeSamFiles.jar \\ | |
USE_THREADING=TRUE \\ | |
{1} \\ | |
OUTPUT={0} | |
""".format(outputBam, (" ".join(["INPUT=%s"] * len(inputBams))) % tuple(inputBams)) | |
return command | |
def fastqc(inputBam, outputDir): | |
command = """ | |
# Measuring sample quality with Fastqc | |
echo "Measuring sample quality with Fastqc" | |
module load java/jdk/1.7.0_65 | |
module load FastQC/0.11.2 | |
fastqc --noextract \\ | |
--outdir {0} \\ | |
{1} | |
""".format(outputDir, inputBam) | |
return command | |
def bam2fastq(inputBam, outputFastq, outputFastq2=None, unpairedFastq=None): | |
command = """ | |
# Convert to Fastq format | |
echo "Converting to Fastq format" | |
java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/SamToFastq.jar \\ | |
INPUT={0} \\ | |
""".format(inputBam) | |
if outputFastq2 is None and unpairedFastq is None: | |
command += """FASTQ={0} | |
""".format(outputFastq) | |
else: | |
command += """FASTQ={0} \\ | |
SECOND_END_FASTQ={1} \\ | |
UNPAIRED_FASTQ={2} | |
""".format(outputFastq, outputFastq2, unpairedFastq) | |
return command | |
def trimmomatic(inputFastq1, outputFastq1, cpus, adapters, log, | |
inputFastq2=None, outputFastq1unpaired=None, | |
outputFastq2=None, outputFastq2unpaired=None): | |
PE = False if inputFastq2 is None else True | |
pe = "PE" if PE else "SE" | |
command = """ | |
# Trimming adapters from sample | |
echo "Trimming adapters from sample" | |
module load trimmomatic/0.32 | |
java -Xmx4g -jar `which trimmomatic-0.32.jar` """ | |
command += """{0} \\ | |
-threads {1} \\ | |
-trimlog {2} \\ | |
{3} \\ | |
""".format(pe, cpus, log, inputFastq1) | |
if PE: | |
command += """\\ | |
{0} \\ | |
""".format(inputFastq2) | |
command += """\\ | |
{0} \\ | |
""".format(outputFastq1) | |
if PE: | |
command += """\\ | |
{0} \\ | |
{1} \\ | |
{2} \\ | |
""".format(outputFastq1unpaired, | |
outputFastq2, outputFastq2unpaired) | |
command += """\\ | |
ILLUMINACLIP:{0}:1:40:15:8:true \\ | |
LEADING:3 TRAILING:3 \\ | |
SLIDINGWINDOW:4:10 \\ | |
MINLEN:36 | |
""".format(adapters) | |
return command | |
def skewer(inputFastq1, outputPrefix, cpus, adapters, inputFastq2=None): | |
PE = False if inputFastq2 is None else True | |
mode = "pe" if PE else "any" | |
command = """ | |
# Trimming adapters from sample | |
echo "Trimming adapters from sample" | |
skewer --quiet \\ | |
-t {0} \\ | |
-m {1} \\ | |
-x {2} \\ | |
-o {3} \\ | |
{4} """.format(cpus, mode, adapters, outputPrefix, inputFastq1) | |
if inputFastq2 is not None: | |
command += """\\ | |
{0} | |
""".format(inputFastq2) | |
return command | |
def bowtie2Map(inputFastq1, outputBam, log, metrics, genomeIndex, maxInsert, cpus, inputFastq2=None): | |
""" | |
""" | |
outputBam = re.sub("\.bam$", "", outputBam) | |
# Admits 2000bp-long fragments (--maxins option) | |
command = """ | |
# Mapping reads with Bowtie2 | |
echo "Mapping reads with Bowtie2" | |
module load bowtie/2.2.3 | |
module load samtools | |
bowtie2 --very-sensitive -p {0} \\ | |
-x {1} \\ | |
--met-file {2} \\ | |
""".format(cpus, genomeIndex, metrics) | |
if inputFastq2 is None: | |
command += "{0} ".format(inputFastq1) | |
else: | |
command += """--maxins {0} -1 {1} \\ | |
-2 {2} """.format(maxInsert, inputFastq1, inputFastq2) | |
command += """\\ | |
2> {0} | \\ | |
samtools view -S -b - | \\ | |
samtools sort - {1} | |
""".format(log, outputBam) | |
return command | |
def shiftReads(inputBam, genome, outputBam): | |
outputBam = re.sub("\.bam$", "", outputBam) | |
# TODO: | |
# Implement read shifting with HTSeq or Cython | |
command = """ | |
# Shifting read of tagmented sample | |
echo "Shifting read of tagmented sample" | |
module load samtools | |
module load python | |
samtools view -h {0} | \\ | |
python {3}/lib/shift_reads.py {1} | \\ | |
samtools view -S -b - | \\ | |
samtools sort - {2} | |
""".format(inputBam, genome, outputBam, | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__)))) | |
return command | |
def markDuplicates(inputBam, outputBam, metricsFile, tempDir="."): | |
transientFile = re.sub("\.bam$", "", outputBam) + ".dups.nosort.bam" | |
outputBam = re.sub("\.bam$", "", outputBam) | |
command = """ | |
# Marking duplicates with piccard | |
echo "Mark duplicates with piccard" | |
module load samtools | |
java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/MarkDuplicates.jar \\ | |
INPUT={0} \\ | |
OUTPUT={1} \\ | |
METRICS_FILE={2} \\ | |
VALIDATION_STRINGENCY=LENIENT \\ | |
TMP_DIR={3} | |
# Sort bam file with marked duplicates | |
samtools sort {1} \\ | |
{4} | |
if [[ -s {1} ]] | |
then | |
rm {1} | |
fi | |
""".format(inputBam, transientFile, metricsFile, tempDir, outputBam) | |
return command | |
def removeDuplicates(inputBam, outputBam, cpus=16): | |
command = """ | |
# Removing duplicates with sambamba | |
echo "Remove duplicates with sambamba" | |
sambamba markdup -t {2} -r \\ | |
{0} \\ | |
{1} | |
""".format(inputBam, outputBam, cpus) | |
return command | |
def indexBam(inputBam): | |
command = """ | |
# Indexing bamfile with samtools | |
echo "Indexing bamfile with samtools" | |
module load samtools | |
samtools index {0} | |
""".format(inputBam) | |
return command | |
def peakTools(inputBam, output, plot, cpus): | |
command = """ | |
# Get sample QC metrics | |
echo "Getting sample QC metrics" | |
module load boost/1.57 | |
module load gcc/4.8.2 | |
module load openmpi/gcc/64/1.8.1 | |
run_spp.R -rf -savp -savp={2} -s=0:5:500 \\ | |
-c={0} -out={1} | |
""".format(inputBam, output, plot) | |
return command | |
def qc(): | |
""" | |
$PRESEQ c_curve -B $PROJECTDIR/mapped/$SAMPLE_NAME.trimmed.bowtie2.sorted.shifted.dup.bam \ | |
-o $PROJECTDIR/mapped/$SAMPLE_NAME_qc_c_curve.txt | |
$PRESEQ lc_extrap -e 1e8 -s 2e6 -B $PROJECTDIR/mapped/$SAMPLE_NAME.trimmed.bowtie2.sorted.shifted.dup.bam \ | |
-o $PROJECTDIR/mapped/$SAMPLE_NAME_qc_lc_extrap.txt | |
""" | |
raise NotImplementedError("Function not implemented yet.") | |
def bamToUCSC(inputBam, outputBigWig, genomeSizes, genome, tagmented=False): | |
transientFile = os.path.abspath(re.sub("\.bigWig", "", outputBigWig)) | |
if not tagmented: | |
# TODO: | |
# addjust fragment length dependent on read size and real fragment size | |
# (right now it asssumes 50bp reads with 180bp fragments) | |
command = """ | |
# Making bigWig tracks from bam file | |
echo "making bigWig tracks from bam file" | |
module load bedtools | |
bedtools bamtobed -i {0} | \\ | |
bedtools slop -i stdin -g {1} -s -l 0 -r 130 | \\ | |
python {5}/lib/fix_bedfile_genome_boundaries.py {4} | \\ | |
genomeCoverageBed -i stdin -bg -g {1} \\ | |
> {2}.cov | |
bedGraphToBigWig {2}.cov \\ | |
{1} \\ | |
{3} | |
# remove cov file | |
if [[ -s {2}.cov ]] | |
then | |
rm {2}.cov | |
fi | |
chmod 755 {3} | |
""".format(inputBam, genomeSizes, transientFile, outputBigWig, genome, | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__)))) | |
else: | |
command = """ | |
# Making bigWig tracks from bam file | |
echo "making bigWig tracks from bam file" | |
module load bedtools | |
bedtools bamtobed -i {0} | | |
genomeCoverageBed -5 -i stdin -bg -g {1} \\ | |
> {2}.cov | |
bedGraphToBigWig {2}.cov \\ | |
{1} \\ | |
{3} | |
# remove cov file | |
if [[ -s {2}.cov ]] | |
then | |
rm {2}.cov | |
fi | |
chmod 755 {3} | |
""".format(inputBam, genomeSizes, transientFile, outputBigWig, | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__)))) | |
return command | |
def addTrackToHub(sampleName, trackURL, trackHub, colour, fivePrime=""): | |
command = """ | |
# Adding track to TrackHub | |
echo "Adding track to TrackHub" | |
echo "track type=bigWig name='{0} {1}' description='{0} {1}' """.format(sampleName, fivePrime) | |
command += """height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> \\ | |
{2} | |
chmod 755 {2} | |
""".format(trackURL, colour, trackHub) | |
return command | |
def linkToTrackHub(trackHubURL, fileName, genome): | |
db = "org" if genome == "hg19" else "db" # different database call for human | |
genome = "human" if genome == "hg19" else genome # change hg19 to human | |
html = """ | |
<html> | |
<head> | |
<meta http-equiv="refresh" content="0; url=http://genome.ucsc.edu/cgi-bin/hgTracks?""" | |
html += """{db}={genome}&hgt.customText={trackHubURL}" /> | |
</head> | |
</html> | |
""".format(trackHubURL=trackHubURL, genome=genome, db=db) | |
with open(fileName, 'w') as handle: | |
handle.write(textwrap.dedent(html)) | |
def genomeWideCoverage(inputBam, genomeWindows, output): | |
command = """ | |
# Calculate genome-wide coverage | |
echo "Calculating genome-wide coverage" | |
module load bedtools | |
bedtools coverage -abam -counts \\ | |
-a {0} \\ | |
-b {1} \\ | |
> {2} | |
""".format(inputBam, genomeWindows, output) | |
return command | |
def calculateFRiP(inputBam, inputBed, output): | |
command = "cut -f 1,2,3 {0} ".format(inputBed) | |
command += "| bedtools coverage -counts " | |
command += "-abam {0} ".format(inputBam) | |
command += "-b - " | |
command += "| awk '{{sum+=$4}} END {{print sum}}' > {0}".format(output) | |
return command | |
def macs2CallPeaks(treatmentBam, controlBam, outputDir, sampleName, genome, broad=False, plot=True): | |
if genome == "hg19": | |
genome = "hs" | |
elif genome == "mm10": | |
genome = "mm" | |
elif genome == "dr7": | |
raise ValueError("Genome dr7 not yet supported for peak calling with MACS2.") | |
if not broad: | |
command = """ | |
# Call peaks with MACS2 | |
echo "Calling peaks with MACS2" | |
macs2 callpeak \\ | |
-t {0} \\ | |
-c {1} \\ | |
--bw 200 \\ | |
-g {2} -n {3} --outdir {4} | |
""".format(treatmentBam, controlBam, genome, sampleName, outputDir) | |
command = """ | |
# Call peaks with MACS2 | |
echo "Calling peaks with MACS2" | |
macs2 callpeak \\ | |
-t {0} \\ | |
-c {1} \\ | |
--bw 200 \\ | |
--fix-bimodal --extsize 200 \\ | |
-g {2} -n {3} --outdir {4} | |
""".format(treatmentBam, controlBam, genome, sampleName, outputDir) | |
else: | |
# Parameter setting for broad factors according to Nature Protocols (2012) | |
# Vol.7 No.9 1728-1740 doi:10.1038/nprot.2012.101 Protocol (D) for H3K36me3 | |
command = """ | |
# Call peaks with MACS2 | |
echo "Calling peaks with MACS2" | |
macs2 callpeak \\ | |
-t {0} \\ | |
-c {1} \\ | |
--broad --nomodel --extsize 73 --pvalue 1e-3 \\ | |
-g {2} -n {3} --outdir {4} | |
""".format(treatmentBam, controlBam, genome, sampleName, outputDir) | |
if plot and not broad: | |
command += """ | |
# Plot MACS2 crosscorrelation | |
echo "Plotting MACS2 crosscorrelation" | |
module load R | |
Rscript {0}/{1}_model.r | |
mv {2}/{1}_model.pdf {0}/{1}_model.pdf | |
""".format(outputDir, sampleName, os.getcwd()) | |
return command | |
def sppCallPeaks(treatmentBam, controlBam, treatmentName, controlName, outputDir, broad, cpus): | |
broad = "TRUE" if broad else "FALSE" | |
command = """ | |
# Calling peaks with SPP | |
echo "calling peaks with SPP" | |
module load R | |
Rscript {6}/lib/spp_peak_calling.R \\ | |
{0} \\ | |
{1} \\ | |
{2} \\ | |
{3} \\ | |
{4} \\ | |
{5} \\ | |
{6} | |
""".format(treatmentBam, controlBam, treatmentName, controlName, broad, cpus, | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__)))) | |
return command | |
def homerFindMotifs(peakFile, genome, outputDir, size=150, length="8,10,12,14,16", n_motifs=12): | |
command = """ | |
# Find motifs with Homer | |
echo "finding motifs with Homer" | |
findMotifsGenome.pl {0} \\ | |
{1} \\ | |
{2} \\ | |
-mask -size {3} -len {4} -S {5} | |
""".format(peakFile, genome, outputDir, size, length, n_motifs) | |
return command | |
def AnnotatePeaks(peakFile, genome, motifFile, outputBed): | |
command = """ | |
# Annotate peaks with motif score | |
echo "annotating peaks with motif score" | |
annotatePeaks.pl {0} \\ | |
{1} \\ | |
-mask -mscore -m {2} | \\ | |
tail -n +2 | cut -f 1,5,22 \\ | |
> {3} | |
""".format(peakFile, genome, motifFile, outputBed) | |
return command | |
def centerPeaksOnMotifs(peakFile, genome, windowWidth, motifFile, outputBed): | |
command = """ | |
# Center peaks on motif | |
echo "centering peaks on motif" | |
annotatePeaks.pl {0} \\ | |
{1} \\ | |
-size {2} -center {3} | \\ | |
awk -v OFS='\\t' '{{print $2, $3, $4, $1, $6, $5}}' | \\ | |
awk -v OFS='\\t' -F '\t' '{{ gsub("0", "+", $6) ; gsub("1", "-", $6) ; print }}' | \\ | |
python {5}/lib/fix_bedfile_genome_boundaries.py {1} | \\ | |
sortBed > {4} | |
""".format(peakFile, genome, windowWidth, motifFile, outputBed, | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__)))) | |
return command | |
def peakAnalysis(inputBam, peakFile, plotsDir, windowWidth, fragmentsize, | |
genome, n_clusters, strand_specific, duplicates): | |
command = """ | |
# Analyse peak profiles | |
echo "Analysing peak profiles" | |
python {0}/lib/peaks_analysis.py {1} \\ | |
{2} \\ | |
{3} \\ | |
--window-width {4} \\ | |
--fragment-size {5} \\ | |
--genome {6} \\ | |
--n_clusters {7} """.format( | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__))), | |
inputBam, peakFile, plotsDir, windowWidth, fragmentsize, genome, n_clusters | |
) | |
if strand_specific: | |
command += "--strand-specific " | |
if duplicates: | |
command += "--duplicates\n" | |
return command | |
def tssAnalysis(inputBam, tssFile, plotsDir, windowWidth, fragmentsize, genome, | |
n_clusters, strand_specific, duplicates): | |
command = """ | |
# Analyse signal over TSSs | |
echo "Analysing signal over TSSs" | |
python {0}/lib/tss_analysis.py {1} \\ | |
{2} \\ | |
{3} \\ | |
--window-width {4} \\ | |
--fragment-size {5} \\ | |
--genome {6} \\ | |
--n_clusters {7} """.format( | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__))), | |
inputBam, tssFile, plotsDir, windowWidth, fragmentsize, genome, n_clusters | |
) | |
if strand_specific: | |
command += "--strand-specific " | |
if duplicates: | |
command += "--duplicates " | |
return command | |
def footprintAnalysis(): | |
raise NotImplementedError("Function not implemented yet.") | |
def plotCorrelations(inputCoverage, plotsDir): | |
command = """ | |
# Plot correlations | |
echo "plotting correlations" | |
python {2}/lib/correlations.py \\ | |
{0} \\ | |
{1} | |
""".format(plotsDir, | |
" ".join(["%s"] * len(inputCoverage)) % tuple(inputCoverage), | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__))) | |
) | |
return command | |
def diffBind(inputCSV, jobName, plotsDir): | |
command = """ | |
# Detect differential binding with diffBind | |
echo "Detecting differential binding with diffBind" | |
module load R | |
Rscript {3}/lib/diffBind_analysis.R \\ | |
{0} \\ | |
{1} \\ | |
{2} \\ | |
""".format(inputCSV, jobName, plotsDir, | |
os.path.abspath(os.path.dirname(os.path.realpath(__file__)))) | |
return command | |
if __name__ == '__main__': | |
try: | |
main() | |
except KeyboardInterrupt: | |
print("Program canceled by user!") | |
sys.exit(1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment