Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
An old pipeline for ChIP-seq/ChIPmentation data processing used back in 2014. Uploading for preservation
#!/usr/bin/env python
"""
ChIP-seq pipeline
"""
from argparse import ArgumentParser
import os
import sys
import logging
import pandas as pd
import numpy as np
import time
import random
import re
import string
import textwrap
import shutil
import subprocess
# TODO: solve pandas chained assignments
pd.options.mode.chained_assignment = None
__author__ = "Andre Rendeiro"
__copyright__ = "Copyright 2014, Andre Rendeiro"
__credits__ = []
__license__ = "GPL2"
__version__ = "0.1"
__maintainer__ = "Andre Rendeiro"
__email__ = "arendeiro@cemm.oeaw.ac.at"
__status__ = "Development"
def main():
# Parse command-line arguments
parser = ArgumentParser(description="ChIP-seq pipeline.")
# Global options
# positional arguments
# optional arguments
parser.add_argument("-r", "--project-root", default="/fhgfs/groups/lab_bock/shared/projects/",
dest="project_root", type=str,
help="""Directory in which the project will reside.
Default=/fhgfs/groups/lab_bock/shared/projects/.""")
parser.add_argument("--html-root", default="/fhgfs/groups/lab_bock/public_html/arendeiro/",
dest="html_root", type=str,
help="""public_html directory in which bigwig files for the project will reside.
Default=/fhgfs/groups/lab_bock/public_html/.""")
parser.add_argument("--url-root", default="http://www.biomedical-sequencing.at/bocklab/arendeiro/",
dest="url_root", type=str,
help="""Url mapping to public_html directory where bigwig files for the project will be accessed.
Default=http://www.biomedical-sequencing.at/bocklab.""")
parser.add_argument("--keep-tmp-files", dest="keep_tmp", action="store_true",
help="Keep intermediary files. If not it will only preserve final files. Default=False.")
parser.add_argument("-c", "--cpus", default=4, dest="cpus",
help="Number of CPUs to use. Default=4.", type=int)
parser.add_argument("-m", "--mem-per-cpu", default=4000, dest="mem",
help="Memory per CPU to use. Default=4000.", type=int)
parser.add_argument("-q", "--queue", default="shortq", dest="queue",
choices=["develop", "shortq", "mediumq", "longq"],
help="Queue to submit jobs to. Default=shortq", type=str)
parser.add_argument("-t", "--time", default="10:00:00", dest="time",
help="Maximum time for jobs to run. Default=10:00:00", type=str)
parser.add_argument("--user-mail", default="", dest="user_mail",
help="User mail address. Default=<submitting user>.", type=str)
parser.add_argument("-l", "--log-level", default="INFO", dest="log_level",
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
help="Logging level. Default=INFO.", type=str)
parser.add_argument("--dry-run", dest="dry_run", action="store_true",
help="Dry run. Assemble commands, but do not submit jobs to slurm. Default=False.")
parser.add_argument("--no-checks", dest="checks", action="store_false",
help="Don't check file existence and integrity. Default=False.")
# Sub commands
subparser = parser.add_subparsers(title="sub-command", dest="command")
# preprocess
preprocess_subparser = subparser.add_parser("preprocess")
preprocess_subparser.add_argument(dest="project_name", help="Project name.", type=str)
preprocess_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str)
preprocess_subparser.add_argument("-s", "--stage", default="all", dest="stage",
choices=["all", "bam2fastq", "fastqc", "trim", "mapping",
"shiftreads", "markduplicates", "removeduplicates",
"indexbam", "maketracks", "coverage"],
help="Run only these stages. Default=all.", type=str)
preprocess_subparser.add_argument("--trimmer", default="skewer", choices=["trimmomatic", "skewer"],
dest="trimmer", help="Trimmer to use. Default=skewer.", type=str)
preprocess_subparser.add_argument("-i", "--max-insert-size", default=2000,
dest="maxinsert",
help="Maximum allowed insert size allowed for paired end mates. Default=2000.",
type=int)
preprocess_subparser.add_argument("--window-size", default=1000, dest="windowsize",
help="Window size used for genome-wide correlations. Default=1000.",
type=int)
# stats
stats_subparser = subparser.add_parser("stats")
stats_subparser.add_argument(dest="project_name", help="Project name.", type=str)
stats_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str)
# analyse
analyse_subparser = subparser.add_parser("analyse")
analyse_subparser.add_argument(dest="project_name", help="Project name.", type=str)
analyse_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str)
analyse_subparser.add_argument("-s", "--stage", default="all", dest="stage",
choices=["all", "qc", "callpeaks", "findmotifs", "centerpeaks",
"annotatepeaks", "peakanalysis", "tssanalysis", "footprints", "frip"],
help="Run only these stages. Default=all.", type=str)
analyse_subparser.add_argument("--peak-caller", default="macs2", choices=["macs2", "spp"],
dest="peak_caller", help="Peak caller to use. Default=macs2.", type=str)
analyse_subparser.add_argument("--peak-window-width", default=2000,
dest="peak_window_width",
help="Width of window around peak motifs. Default=2000.",
type=int)
# compare
compare_subparser = subparser.add_parser("compare")
compare_subparser.add_argument(dest="project_name", help="Project name.", type=str)
compare_subparser.add_argument(dest="csv", help="CSV file with sample annotation.", type=str)
compare_subparser.add_argument("-s", "--stage", default="all", dest="stage",
choices=["all", "diffbind", "correlations"],
help="Run only these stages. Default=all.", type=str)
compare_subparser.add_argument("--duplicates", dest="duplicates", action="store_true",
help="Allow duplicates in coorelation analysis. Default=False.")
compare_subparser.add_argument("--genome-window-width", default=1000,
dest="genome_window_width",
help="Width of window to make genome-wide correlations. Default=1000.",
type=int)
# Parse
args = parser.parse_args()
# Logging
logger = logging.getLogger(__name__)
levels = {
"DEBUG": logging.DEBUG,
"INFO": logging.INFO,
"WARNING": logging.WARNING,
"ERROR": logging.ERROR,
"CRITICAL": logging.CRITICAL
}
logger.setLevel(levels[args.log_level])
# create a file handler
# (for now in current working dir, in the end copy log to projectDir)
handler = logging.FileHandler(os.path.join(os.getcwd(), args.project_name + ".log"))
handler.setLevel(logging.INFO)
# format logger
formatter = logging.Formatter(fmt='%(levelname)s: %(asctime)s - %(message)s', datefmt='%Y/%m/%d %H:%M:%S')
handler.setFormatter(formatter)
logger.addHandler(handler)
# create a stout handler
stdout = logging.StreamHandler(sys.stdout)
stdout.setLevel(logging.ERROR)
formatter = logging.Formatter(fmt='%(levelname)s: %(asctime)s - %(message)s', datefmt='%Y/%m/%d %H:%M:%S')
stdout.setFormatter(formatter)
logger.addHandler(stdout)
# Start main function
if args.command == "preprocess":
logger, fr, to = preprocess(args, logger)
elif args.command == "stats":
logger, fr, to = readStats(args, logger)
elif args.command == "analyse":
logger, fr, to = analyse(args, logger)
elif args.command == "compare":
logger, fr, to = compare(args, logger)
# Exit
logger.info("Finished and exiting.")
# Copy log to projectDir
shutil.copy2(fr, to)
sys.exit(1)
def checkProjectDirs(args, logger):
# Directories and paths
# check args.project_root exists and user has write access
args.project_root = os.path.abspath(args.project_root)
logger.debug("Checking if %s directory exists and is writable." % args.project_root)
if not os.access(args.project_root, os.W_OK):
logger.error("%s does not exist, or user has no write access.\n\
Use option '-r' '--project-root' to set a non-default project root path." % args.project_root)
sys.exit(1)
# check args.html_root exists and user has write access
htmlDir = os.path.abspath(args.html_root)
logger.debug("Checking if %s directory exists and is writable." % args.project_root)
if not os.access(htmlDir, os.W_OK):
logger.error("%s does not exist, or user has no write access.\n\
Use option '--html-root' to set a non-default html root path." % htmlDir)
sys.exit(1)
# project directories
projectDir = os.path.join(args.project_root, args.project_name)
dataDir = os.path.join(projectDir, "data")
resultsDir = os.path.join(projectDir, "results")
# make relative project dirs
dirs = [
projectDir,
os.path.join(projectDir, "runs"),
dataDir,
os.path.join(dataDir, "fastq"),
os.path.join(dataDir, "fastqc"),
os.path.join(dataDir, "raw"),
os.path.join(dataDir, "mapped"),
os.path.join(dataDir, "coverage"),
os.path.join(dataDir, "peaks"),
os.path.join(dataDir, "motifs"),
resultsDir,
os.path.join(resultsDir, "plots"),
os.path.join(resultsDir, "plots", "cdt"),
os.path.join(resultsDir, "plots", "pickles"),
htmlDir,
os.path.join(args.html_root, args.project_name),
os.path.join(args.html_root, args.project_name, "bigWig")
]
for d in dirs:
if not os.path.exists(d):
os.makedirs(d)
# chmod of paths to public_html folder
html = [
htmlDir,
os.path.join(args.html_root, args.project_name),
os.path.join(args.html_root, args.project_name, "bigWig")
]
for d in html:
try:
os.chmod(d, 0755)
except OSError:
logger.error("cannot change folder's mode: %s" % d)
continue
htmlDir = os.path.join(args.html_root, args.project_name, "bigWig")
urlRoot = args.url_root + args.project_name + "/bigWig/"
return (htmlDir, projectDir, dataDir, resultsDir, urlRoot)
def getReplicates(samples):
"""
Returns new sample annotation sheet with provided samples, plus biological replicates
(merged technical replicates) and merged biological replicates.
samples - a pandas.DataFrame with sample info.
"""
# ignore some fields in the annotation sheet
varsName = ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "biologicalReplicate", "technicalReplicate", "genome"]
# get sample names
samples["sampleName"] = ["_".join([str(j) for j in samples.ix[i][varsName]]) for i in samples.index]
samplesMerged = samples.copy()
# get merged technical replicates -> biological replicates
variables = ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "biologicalReplicate", "genome"]
for key, values in samples.groupby(variables).groups.items():
rep = samples.ix[values][varsName].reset_index(drop=True).ix[0]
if len(values) > 1:
rep["experimentName"] = np.nan
rep["technicalReplicate"] = 0
rep["unmappedBam"] = samples.ix[values]["unmappedBam"].tolist()
rep["sampleName"] = "_".join([str(i) for i in rep[varsName]])
# append biological replicate to sample annotation
samplesMerged = samplesMerged.append(rep, ignore_index=True)
# get merged biological replicates -> merged biological replicates
variables = ["cellLine", "numberCells", "technique", "ip", "patient", "treatment", "genome"]
for key, values in samples.groupby(variables).groups.items():
rep = samples.ix[values][varsName].reset_index(drop=True).ix[0]
if len(values) > 1:
# check samples in group are from different biological replicates
if len(samples.ix[samples.groupby(variables).groups[key]]['biologicalReplicate'].unique()) > 1:
rep["experimentName"] = np.nan
rep["biologicalReplicate"] = 0
rep["technicalReplicate"] = 0
rep["unmappedBam"] = samples.ix[values]["unmappedBam"].tolist()
rep["sampleName"] = "_".join([str(i) for i in rep[varsName]])
# append biological replicate to sample annotation
samplesMerged = samplesMerged.append(rep, ignore_index=True)
# replace sample name with list of sample names for only one sample (original case)
for i in range(len(samplesMerged)):
if type(samplesMerged["unmappedBam"][i]) is not list:
samplesMerged["unmappedBam"][i] = [samplesMerged["unmappedBam"][i]]
return samplesMerged.sort(["sampleName"])
def preprocess(args, logger):
"""
This takes unmapped Bam files and makes trimmed, aligned, indexed (and shifted if necessary)
Bam files along with a UCSC browser track.
"""
logger.info("Starting sample preprocessing.")
logger.debug("Checking project directories exist and creating if not.")
htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger)
# Paths to static files on the cluster
genomeFolder = "/fhgfs/prod/ngs_resources/genomes/"
genomeIndexes = {
"hg19": "/home/arendeiro/reference/Homo_sapiens/forBowtie2/hg19",
"mm10": os.path.join(genomeFolder, "mm10/forBowtie2/mm10/index_woRandom/mm10"),
"dr7": os.path.join(genomeFolder, "dr7/forBowtie2/dr7")
}
genomeSizes = {
"hg19": "/fhgfs/groups/lab_bock/arendeiro/share/hg19.chrom.sizes",
"mm10": "/fhgfs/groups/lab_bock/arendeiro/share/mm10.chrom.sizes",
"dr7": "/fhgfs/groups/lab_bock/arendeiro/share/danRer7.chrom.sizes"
}
genomeWindows = {
"hg19": "/fhgfs/groups/lab_bock/arendeiro/share/hg19.genomeWindows_1kb.bed",
"mm10": "/fhgfs/groups/lab_bock/arendeiro/share/mm10.genomeWindows_1kb.bed",
"dr7": "/fhgfs/groups/lab_bock/arendeiro/share/danRer7.genomeWindows_1kb.bed"
}
adapterFasta = "/fhgfs/groups/lab_bock/shared/cm.fa"
# Other static info
tagment = [
"DNASE", "DNASESEQ", "DNASE-SEQ", "DHS", "DHS-SEQ", "DHSSEQ",
"ATAC", "ATAC-SEQ", "ATACSEQ",
"CM"
]
# from the ggplot2 color blind pallete
# #999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
# shortly: green,red=active, orange=transcription, blue=repression, yellow=potential
colours = {
"IGG": "153,153,153", "INPUT": "153,153,153", # grey
"H3K36ME1": "230,159,0", "H3K36ME2": "230,159,0", "H3K36ME3": "230,159,0", # orange
"H3K4ME3": "0,158,115", # bluish green
"H3K4ME1": "120,114,33", "H3K14ac": "120,114,33", # yellow
"H3K27ME1": "0,114,178", "H3K27ME2": "0,114,178", "H3K27ME3": "0,114,178", # blue
"H3K9ME1": "86,180,233", "H3K9ME2": "86,180,233", "H3K9ME3": "86,180,233", # sky blue
"H3AC": "213,94,0", "H3K9AC": "213,94,0", "H3K27AC": "213,94,0", "H3K56AC": "213,94,0", "H3K56AC": "213,94,0", # vermillion
"H3K79ME1": "204,121,167", "H3K79ME2": "204,121,167", "H3K79ME3": "204,121,167", # reddish purple
"ATAC": "0,158,115",
"DNASE": "0,158,115",
}
colour_gradient = [ # 10 colour gradient from red to blue
"155,3,5", "140,2,18", "125,2,31", "110,2,44", "96,2,57",
"81,2,70", "66,2,83", "52,2,96", "37,2,109", "22,2,122"
]
# Parse sample information
args.csv = os.path.abspath(args.csv)
# check if exists and is a file
if not os.path.isfile(args.csv):
logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv)
sys.exit(1)
# read in
samples = pd.read_csv(args.csv)
# TODO:
# Perform checks on the variables given
# (e.g. genome in genomeIndexes)
# start pipeline
projectName = string.join([args.project_name, time.strftime("%Y%m%d-%H%M%S")], sep="_")
# Get biological replicates and merged biological replicates
logger.debug("Checking which samples should be merged.")
samplesMerged = getReplicates(samples.copy())
# ^^ this is the new annotation sheet as well
samplesMerged['readType'] = None
samplesMerged['readLength'] = None
# add field for manual sample pairing
samplesMerged["controlSampleName"] = None
# Preprocess samples
for sample in range(len(samplesMerged)):
# Get sample name
sampleName = samplesMerged["sampleName"][sample]
# get jobname
jobName = projectName + "_" + sampleName
# check if sample is paired end
PE = False
# check if sample is tagmented or not:
tagmented = True
# get intermediate names for files
if len(samplesMerged["unmappedBam"][sample]) == 1:
unmappedBam = samplesMerged["unmappedBam"][sample][0]
else:
unmappedBam = os.path.join(dataDir, "raw", sampleName + ".bam")
if not tagmented:
bam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2")
else:
bam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.shifted")
# get colour for tracks
if str(samplesMerged["ip"][sample]).upper() in colours.keys():
colour = colours[samplesMerged["ip"][sample].upper()]
else:
if samplesMerged["technique"][sample] in ["ATAC", "ATACSEQ", "ATAC-SEQ"]:
colour = colours["ATAC"]
elif samplesMerged["technique"][sample] in ["DNASE", "DNASESEQ", "DNASE-SEQ"]:
colour = colours["DNASE"]
else:
colour = random.sample(colour_gradient, 1)[0] # pick one randomly
# keep track of temporary files
tempFiles = list()
# assemble commands
# get job header
jobCode = slurmHeader(
jobName=jobName,
output=os.path.join(projectDir, "runs", jobName + ".slurm.log"),
queue=args.queue,
time=args.time,
cpusPerTask=args.cpus,
memPerCpu=args.mem,
userMail=args.user_mail
)
if args.stage in ["all", "trim"]:
# TODO:
# Change absolute path to something usable by everyone or to an option.
if args.trimmer == "trimmomatic":
jobCode += trimmomatic(
inputFastq1=samplesMerged["unmappedBam"][sample],
inputFastq2=os.path.join(dataDir, "fastq", sampleName + ".2.fastq") if PE else None,
outputFastq1=os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq") if not PE else os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq"),
outputFastq1unpaired=os.path.join(dataDir, "fastq", sampleName + ".1_unpaired.trimmed.fastq") if PE else None,
outputFastq2=os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq") if PE else None,
outputFastq2unpaired=os.path.join(dataDir, "fastq", sampleName + ".2_unpaired.trimmed.fastq") if PE else None,
cpus=args.cpus,
adapters=adapterFasta,
log=os.path.join(resultsDir, sampleName + ".trimlog.txt")
)
if not PE:
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq"))
else:
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq"))
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq"))
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".1_unpaired.trimmed.fastq"))
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".2_unpaired.trimmed.fastq"))
if args.trimmer == "skewer":
jobCode += skewer(
inputFastq1=samplesMerged["unmappedBam"][sample] if not PE else os.path.join(dataDir, "fastq", sampleName + ".1.fastq"),
inputFastq2=os.path.join(dataDir, "fastq", sampleName + ".2.fastq") if PE else None,
outputPrefix=os.path.join(dataDir, "fastq", sampleName),
cpus=args.cpus,
adapters=adapterFasta
)
# move files to have common name
if not PE:
jobCode += moveFile(
os.path.join(dataDir, "fastq", sampleName + "-trimmed.fastq"),
os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq")
)
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq"))
else:
jobCode += moveFile(
os.path.join(dataDir, "fastq", sampleName + "-trimmed-pair1.fastq"),
os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq")
)
jobCode += moveFile(
os.path.join(dataDir, "fastq", sampleName + "-trimmed-pair2.fastq"),
os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq")
)
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq"))
tempFiles.append(os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq"))
# move log to results dir
jobCode += moveFile(
os.path.join(dataDir, "fastq", sampleName + "-trimmed.log"),
os.path.join(resultsDir, sampleName + ".trimlog.txt")
)
if args.stage in ["all", "mapping"]:
if samplesMerged["genome"][sample] not in genomeIndexes:
logger.error("Sample %s has unsuported genome index: %s" % (sampleName, samplesMerged["genome"][sample]))
sys.exit(1)
jobCode += bowtie2Map(
inputFastq1=os.path.join(dataDir, "fastq", sampleName + ".1.trimmed.fastq") if PE else os.path.join(dataDir, "fastq", sampleName + ".trimmed.fastq"),
inputFastq2=os.path.join(dataDir, "fastq", sampleName + ".2.trimmed.fastq") if PE else None,
outputBam=os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.bam"),
log=os.path.join(resultsDir, sampleName + ".alnRates.txt"),
metrics=os.path.join(resultsDir, sampleName + ".alnMetrics.txt"),
genomeIndex=genomeIndexes[samplesMerged["genome"][sample]],
maxInsert=args.maxinsert,
cpus=args.cpus
)
tempFiles.append(os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.bam"))
if args.stage in ["all", "shiftreads"]:
if tagmented:
jobCode += shiftReads(
inputBam=os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.bam"),
genome=samplesMerged["genome"][sample],
outputBam=bam + ".bam"
)
if args.stage in ["all", "markduplicates"]:
jobCode += markDuplicates(
inputBam=bam + ".bam",
outputBam=bam + ".dups.bam",
metricsFile=os.path.join(resultsDir, sampleName + ".duplicates.txt") #
# tempDir=
)
if args.stage in ["all", "removeduplicates"]:
jobCode += removeDuplicates(
inputBam=bam + ".dups.bam",
outputBam=bam + ".nodups.bam",
cpus=args.cpus
)
if args.stage in ["all", "indexbam"]:
jobCode += indexBam(
inputBam=bam + ".dups.bam"
)
jobCode += indexBam(
inputBam=bam + ".nodups.bam"
)
if args.stage in ["all", "maketracks"]:
# right now tracks are only made for bams with duplicates
jobCode += bamToUCSC(
inputBam=bam + ".dups.bam",
outputBigWig=os.path.join(htmlDir, sampleName + ".bigWig"),
genomeSizes=genomeSizes[samplesMerged["genome"][sample]],
genome=samplesMerged["genome"][sample],
tagmented=False
)
jobCode += addTrackToHub(
sampleName=sampleName,
trackURL=urlRoot + sampleName + ".bigWig",
trackHub=os.path.join(htmlDir, "trackHub_{0}.txt".format(samplesMerged["genome"][sample])),
colour=colour
)
if tagmented:
jobCode += bamToUCSC(
inputBam=bam + ".dups.bam",
outputBigWig=os.path.join(htmlDir, sampleName + ".5prime.bigWig"),
genomeSizes=genomeSizes[samplesMerged["genome"][sample]],
genome=samplesMerged["genome"][sample],
tagmented=True
)
jobCode += addTrackToHub(
sampleName=sampleName,
trackURL=urlRoot + sampleName + ".5prime.bigWig",
trackHub=os.path.join(htmlDir, "trackHub_{0}.txt".format(samplesMerged["genome"][sample])),
fivePrime="5prime",
colour=colour
)
linkToTrackHub(
trackHubURL="{0}/{1}/bigWig/trackHub_{2}.txt".format(args.url_root, args.project_name, samplesMerged["genome"][sample]),
fileName=os.path.join(projectDir, "ucsc_tracks_{0}.html".format(samplesMerged["genome"][sample])),
genome=samplesMerged["genome"][sample]
)
if args.stage in ["all", "coverage"]:
jobCode += genomeWideCoverage(
inputBam=bam + ".nodups.bam",
genomeWindows="/fhgfs/groups/lab_bock/arendeiro/share/hg19.genomeWindows_1kb.filteredMappability50mer.filteredBlacklisted.bed",
output=os.path.join(dataDir, "coverage", sampleName + ".cov")
)
# Remove intermediary files
if args.stage == "all" and not args.keep_tmp:
logger.debug("Removing intermediary files")
for fileName in tempFiles:
jobCode += removeFile(fileName)
# Submit job to slurm
# Get concatenated string with code from all modules
jobCode += slurmFooter()
# Output file name
jobFile = os.path.join(projectDir, "runs", jobName + ".sh")
with open(jobFile, 'w') as handle:
handle.write(textwrap.dedent(jobCode))
# Submit to slurm
if not args.dry_run:
logger.info("Submitting job to slurm: %s" % jobName)
status = slurmSubmitJob(jobFile)
if status != 0:
logger.error("Slurm job '%s' not successfull" % jobFile)
sys.exit(1)
logger.debug("Project '%s'submission finished successfully." % args.project_name)
# Write location of bam file in merged samples annotation sheet
samplesMerged['unmappedBam'][sample] = bam + ".dups.bam"
# write original annotation sheet to project folder
samples.to_csv(os.path.join(projectDir, args.project_name + ".annotation_sheet.csv"), index=False)
# write annotation sheet with biological replicates to project folder
samplesMerged.to_csv(
os.path.join(projectDir, args.project_name + ".replicates.annotation_sheet.csv"),
index=False
)
logger.debug("Finished preprocessing")
return (logger,
os.path.join(os.getcwd(), args.project_name + ".log"),
os.path.join(projectDir, "runs", args.project_name + ".log"))
def readStats(args, logger):
"""
Given an annotation sheet with replicates, gets number of reads mapped, duplicates, etc...
"""
logger.info("Starting sample read stats.")
logger.debug("Checking project directories exist and creating if not.")
htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger)
# Parse sample information
args.csv = os.path.abspath(args.csv)
# check if exists and is a file
if not os.path.isfile(args.csv):
logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv)
sys.exit(1)
# read in
samples = pd.read_csv(args.csv)
variables = ["cellLine", "numberCells", "technique", "ip", "patient",
"treatment", "biologicalReplicate", "technicalReplicate", "genome"]
cols = ["readCount", "unpaired", "unaligned", "unique", "multiple", "alignmentRate"]
for col in cols:
samples[col] = None
cols = ["unpairedReadsExamined", "readPairsExamined", "unmappedReads", "unpairedReadDuplicates",
"readPairDuplicates", "readPairOpticalDuplicates", "percentDuplication", "estimatedLibrarySize"]
samples["totalReads"] = None
for col in cols:
samples[col] = None
if "peakFile" in samples.columns:
samples["NSC"] = None
samples["RSC"] = None
samples["peakNumber"] = None
samples["FRiP"] = None
for sample in xrange(len(samples)):
# Sample name
# if sampleName is not provided, use a concatenation of several variable (excluding longest)
if str(samples["sampleName"][sample]) != "nan":
sampleName = samples["sampleName"][sample]
else:
sampleName = string.join([str(samples[var][sample]) for var in variables], sep="_")
logger.debug("No sample name provided, using concatenation of variables supplied")
# Get alignment stats
try:
# get stats into a series
alnStats = parseBowtieStats(os.path.join(resultsDir, sampleName + ".alnRates.txt"))
# add to dataFrame
samples.loc[sample, alnStats.index.tolist()] = alnStats
except:
logger.warn("Record with alignment rates is empty or not found for sample %s" % sampleName)
pass
# Get duplicate stats
try:
dups = pd.read_csv(
os.path.join(resultsDir, sampleName + ".duplicates.txt"),
sep="\t",
comment="#",
header=0
)
# drop if empty
dups.dropna(thresh=len(dups.columns) - 1, inplace=True)
# Add values to sample sheet
for col in range(len(cols)):
samples.loc[sample, cols[col]] = dups.drop(["LIBRARY"], axis=1).ix[0][col]
except:
logger.warn("Record with duplicates is empty or not found for sample %s" % sampleName)
pass
# Get NSC and RSC
qcFile = os.path.join(resultsDir, "sample_QC.tsv")
qc = parseQC(sampleName, qcFile)
samples.loc[sample, "NSC"] = qc["NSC"]
samples.loc[sample, "RSC"] = qc["RSC"]
samples.loc[sample, "qualityTag"] = qc["qualityTag"]
# Count peak number (if peaks in sheet)
if "peakFile" in samples.columns:
# and if sample has peaks
if str(samples["peakFile"][sample]) != "nan":
proc = subprocess.Popen(["wc", "-l", samples["peakFile"][sample]], stdout=subprocess.PIPE)
out, err = proc.communicate()
samples.loc[sample, "peakNumber"] = re.sub("\D.*", "", out)
# Get FRiP from file (if exists) and add to sheet
if "peakFile" in samples.columns:
# if sample has peaks
if str(samples["peakFile"][sample]) != "nan":
try:
with open(os.path.join(resultsDir, sampleName + "_FRiP.txt"), "r") as handle:
content = handle.readlines()
readsInPeaks = int(re.sub("\D", "", content[0]))
mappedReads = samples["readCount"][sample] - samples["unaligned"][sample]
samples.loc[sample, "FRiP"] = readsInPeaks / mappedReads
except:
logger.warn("Record with FRiP value is empty or not found for sample %s" % sampleName)
pass
# convert duplicate rate to percentage
samples.loc[:, 'percentDuplication'] = samples['percentDuplication'] * 100
# write annotation sheet with biological replicates
samples.to_csv(os.path.join(projectDir, args.project_name + ".read_stats.csv"), index=False)
logger.debug("Finished getting read statistics.")
return (logger,
os.path.join(os.getcwd(), args.project_name + ".log"),
os.path.join(projectDir, "runs", args.project_name + ".log"))
def analyse(args, logger):
logger.info("Starting sample analysis.")
logger.debug("Checking project directories exist and creating if not.")
htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger)
# Paths to static files on the cluster
# Other static info
tagment = [
"DNASE", "DNASESEQ", "DNASE-SEQ", "DHS", "DHS-SEQ", "DHSSEQ",
"ATAC", "ATAC-SEQ", "ATACSEQ",
"CM"
]
histones = ["H2A", "H2B", "H3", "H4"]
broadFactors = [
"H3K27ME1", "H3K27ME2", "H3K27ME3",
"H3K36ME1", "H3K36ME2", "H3K36ME3",
"H3K9ME1", "H3K9ME2", "H3K9ME3",
"H3K72ME1", "H3K72ME2", "H3K72ME3"
]
tssFiles = {
"hg19": "/fhgfs/groups/lab_bock/arendeiro/share/GRCh37_hg19_refSeq.tss.bed",
"mm10": "/fhgfs/groups/lab_bock/arendeiro/share/GRCm38_mm10_refSeq.tss.bed",
"dr7": "/fhgfs/groups/lab_bock/arendeiro/share/GRCh37_hg19_refSeq.tss.bed"
}
# Parse sample information
args.csv = os.path.abspath(args.csv)
# check if exists and is a file
if not os.path.isfile(args.csv):
logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv)
sys.exit(1)
# read in
samples = pd.read_csv(args.csv)
samples["controlSampleunmappedBam"] = None
samples["peakFile"] = None
# TODO:
# Perform checks on the variables given
# (e.g. columns existing, bams existing)
# start pipeline
projectName = string.join([args.project_name, time.strftime("%Y%m%d-%H%M%S")], sep="_")
# Preprocess samples
variables = ["cellLine", "numberCells", "technique", "ip", "patient",
"treatment", "biologicalReplicate", "technicalReplicate", "genome"]
# track jobs to submit
jobs = dict()
for sample in xrange(len(samples)):
# Sample name
# if sampleName is not provided, use a concatenation of several variable (excluding longest)
if str(samples["sampleName"][sample]) != "nan":
sampleName = samples["sampleName"][sample]
else:
sampleName = string.join([str(samples[var][sample]) for var in variables], sep="_")
logger.debug("No sample name provided, using concatenation of variables supplied")
# Pair with control
control = False
# get file path of sample which this sample's controlName is pointing to
ctrlField = samples[samples['sampleName'].isin([samples['controlSampleName'][sample]])]['unmappedBam']
# ^^ doesn't mean it has something (i.e. is empty if no control sample is indicated)
# if there is only one record, use that as control
if len(ctrlField) == 1 and type(ctrlField.values[0]) == str:
control = True
controlName = samples.ix[sample]["controlSampleName"]
controlBam = os.path.abspath(ctrlField.values[0])
samples["controlSampleunmappedBam"][sample] = controlBam
# skip samples without matched control
if not control:
continue
# check if sample is tagmented or not:
tagmented = True if samples["technique"][sample] in tagment else False
if not tagmented:
nodupsbam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.nodups.bam")
else:
nodupsbam = os.path.join(dataDir, "mapped", sampleName + ".trimmed.bowtie2.shifted.nodups.bam")
# Is it a histone?
histone = True if any([i in samples["ip"][sample] for i in histones]) else False
# Is it a broad factor?
broad = False if samples["ip"][sample].upper() not in broadFactors else True
# peakFile
peakFile = os.path.join(
dataDir, "peaks", sampleName,
sampleName + "_peaks" + (".narrowPeak" if not broad else ".broadPeak")
)
samples["peakFile"][sample] = peakFile
jobName = projectName + "_" + sampleName
tempFiles = list()
# assemble commands
jobCode = slurmHeader(
jobName=jobName,
output=os.path.join(projectDir, "runs", jobName + ".slurm.log"),
queue=args.queue,
time=args.time,
cpusPerTask=args.cpus,
memPerCpu=args.mem,
userMail=args.user_mail
)
if args.stage in ["all", "qc"]:
jobCode += peakTools(
inputBam=nodupsbam,
output=os.path.join(resultsDir, "sample_QC.tsv"),
plot=os.path.join(resultsDir, sampleName + "_QC.pdf"),
cpus=args.cpus
)
if args.stage in ["all", "callpeaks"] and control:
if args.peak_caller == "macs2":
# make dir for output
if not os.path.exists(os.path.join(dataDir, "peaks", sampleName)):
os.makedirs(os.path.join(dataDir, "peaks", sampleName))
# For point-source factors use default settings
# For broad factors use broad settings
jobCode += macs2CallPeaks(
treatmentBam=os.path.abspath(samples['unmappedBam'][sample]),
controlBam=controlBam,
outputDir=os.path.join(dataDir, "peaks", sampleName),
sampleName=sampleName,
genome=samples['genome'][sample],
broad=True if broad else False
)
elif args.peak_caller == "spp":
# For point-source factors use default settings
# For broad factors use broad settings
jobCode += sppCallPeaks(
treatmentBam=os.path.abspath(samples['unmappedBam'][sample]),
controlBam=controlBam,
treatmentName=sampleName,
controlName=controlName,
outputDir=os.path.join(dataDir, "peaks", sampleName),
broad=True if broad else False,
cpus=args.cpus
)
if args.stage in ["all", "findmotifs"] and control:
# make dir for motifs
if not os.path.exists(os.path.join(dataDir, "motifs", sampleName)):
os.makedirs(os.path.join(dataDir, "motifs", sampleName))
if not histone:
# For TFs, find the "self" motif
jobCode += homerFindMotifs(
peakFile=peakFile,
genome=samples["genome"][sample],
outputDir=os.path.join(dataDir, "motifs", sampleName),
size="50",
length="8,10,12,14,16",
n_motifs=8
)
# For TFs, find co-binding motifs (broader region)
jobCode += homerFindMotifs(
peakFile=peakFile,
genome=samples["genome"][sample],
outputDir=os.path.join(dataDir, "motifs", sampleName + "_cobinders"),
size="200",
length="8,10,12,14,16",
n_motifs=12
)
else:
# For histones, use a broader region to find motifs
jobCode += homerFindMotifs(
peakFile=peakFile,
genome=samples["genome"][sample],
outputDir=os.path.join(dataDir, "motifs", sampleName),
size="1000",
length="8,10,12,14,16",
n_motifs=20
)
if args.stage in ["all", "centerpeaks"] and control:
# TODO:
# right now this assumes peaks were called with MACS2
# figure a way of magetting the peak files withough using the peak_caller option
# for that would imply taht option would be required when selecting this stage
jobCode += centerPeaksOnMotifs(
peakFile=peakFile,
genome=samples["genome"][sample],
windowWidth=args.peak_window_width,
motifFile=os.path.join(dataDir, "motifs", sampleName, "homerResults", "motif1.motif"),
outputBed=os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifCentered.bed")
)
if args.stage in ["all", "annotatepeaks"] and control:
# TODO:
# right now this assumes peaks were called with MACS2
# figure a way of getting the peak files withough using the peak_caller option
# for that would imply taht option would be required when selecting this stage
jobCode += AnnotatePeaks(
peakFile=peakFile,
genome=samples["genome"][sample],
motifFile=os.path.join(dataDir, "motifs", sampleName, "homerResults", "motif1.motif"),
outputBed=os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifAnnotated.bed")
)
if args.stage in ["all", "peakanalysis"] and control:
jobCode += peakAnalysis(
inputBam=os.path.abspath(samples.ix[sample]['unmappedBam']),
peakFile=os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifCentered.bed"),
plotsDir=os.path.join(resultsDir, 'plots'),
windowWidth=2000,
fragmentsize=1 if tagmented else 50, # change this to actual read length
genome=samples.ix[sample]['genome'],
n_clusters=5,
strand_specific=True,
duplicates=True
)
if args.stage in ["all", "tssanalysis"] and control:
jobCode += tssAnalysis(
inputBam=os.path.abspath(samples.ix[sample]['unmappedBam']),
tssFile=tssFiles[samples.ix[sample]['genome']],
plotsDir=os.path.join(resultsDir, 'plots'),
windowWidth=2000,
fragmentsize=1 if tagmented else 50, # change this to actual read length
genome=samples.ix[sample]['genome'],
n_clusters=5,
strand_specific=True,
duplicates=True
)
if args.stage in ["all", "frip"]:
jobCode += calculateFRiP(
inputBam=os.path.abspath(samples['unmappedBam'][sample]),
inputBed=peakFile,
output=os.path.join(resultsDir, sampleName + "_FRiP.txt"),
)
# if args.stage in ["all", "footprints"] and control and tagmented:
# jobCode += footprintAnalysis(
# os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifCentered.bed"),
# os.path.join(dataDir, "peaks", sampleName, sampleName + "_peaks.motifAnnotated.bed")
# )
# finish job
jobCode += slurmFooter()
jobs[jobName] = jobCode
# Submit jobs to slurm
for jobName, jobCode in jobs.items():
# Output file name
jobFile = os.path.join(projectDir, "runs", jobName + ".sh")
with open(jobFile, 'w') as handle:
handle.write(textwrap.dedent(jobCode))
# Submit to slurm
if not args.dry_run:
logger.info("Submitting jobs to slurm")
status = slurmSubmitJob(jobFile)
if status != 0:
logger.error("Slurm job '%s' not successfull" % jobFile)
sys.exit(1)
logger.debug("Project '%s'submission finished successfully." % args.project_name)
# write original annotation sheet to project folder
samples.to_csv(
os.path.join(projectDir, args.project_name + ".replicates.annotation_sheet.csv"),
index=False
)
logger.debug("Finished comparison")
return (logger,
os.path.join(os.getcwd(), args.project_name + ".log"),
os.path.join(projectDir, "runs", args.project_name + ".log"))
def compare(args, logger):
logger.info("Starting sample comparison.")
logger.debug("Checking project directories exist and creating if not.")
htmlDir, projectDir, dataDir, resultsDir, urlRoot = checkProjectDirs(args, logger)
# Paths to static files on the cluster
# Other static info
# Parse sample information
args.csv = os.path.abspath(args.csv)
# check if exists and is a file
if not os.path.isfile(args.csv):
logger.error("Sample annotation '%s' does not exist, or user has no read access." % args.csv)
sys.exit(1)
# read in
samples = pd.read_csv(args.csv)
# TODO:
# Perform checks on the variables given
# (e.g. columns existing, bams/peaks existing)
# start pipeline
projectName = string.join([args.project_name, time.strftime("%Y%m%d-%H%M%S")], sep="_")
# track jobs to submit
jobs = dict()
# diffBind
if args.stage in ["all", "diffbind"]:
# Submit separate jobs for each genome
genome_groups = samples.groupby(["genome"]).groups
for genome in genome_groups.keys():
# Separate comparison per IPed factor
df = samples.ix[genome_groups[genome]]
IP_groups = df.groupby(["ip"]).groups
for IP in IP_groups.keys():
if IP.upper() in ["INPUT", "IGG"]: # skip groups with control
continue
jobName = projectName + "_" + "diffBind_{0}_{1}".format(genome, IP)
diffBindSheetFile = os.path.join(projectDir, "runs", jobName + ".csv")
print(diffBindSheetFile)
# make diffBind csv file, save it
empty = makeDiffBindSheet(
samples,
samples.ix[IP_groups[IP]],
os.path.join(dataDir, "peaks"),
diffBindSheetFile
)
# ^^ returns Bool for empty diffBind sheet
if not empty:
# create job
jobCode = slurmHeader(
jobName=jobName,
output=os.path.join(projectDir, "runs", jobName + ".slurm.log"),
queue=args.queue,
time=args.time,
cpusPerTask=args.cpus,
memPerCpu=args.mem,
userMail=args.user_mail
)
jobCode += diffBind(
inputCSV=diffBindSheetFile,
jobName=jobName,
plotsDir=os.path.join(resultsDir, 'plots')
)
jobCode += slurmFooter()
jobs[jobName] = jobCode
# Submit job for all samples together (per genome)
if args.stage in ["all", "correlations"]:
# Submit separate jobs for each genome
genome_groups = samples.groupby(["genome"]).groups
for genome in genome_groups.keys():
jobName = projectName + "_" + genome + "_sample_correlations"
# Separate comparison per IPed factor
df = samples.ix[genome_groups[genome]]
jobCode = slurmHeader(
jobName=jobName,
output=os.path.join(projectDir, "runs", jobName + ".slurm.log"),
queue=args.queue,
time=args.time,
cpusPerTask=args.cpus,
memPerCpu=args.mem,
userMail=args.user_mail
)
jobCode += plotCorrelations(
inputCoverage=[os.path.join(dataDir, "coverage", sampleName + ".cov") for sampleName in list(df['sampleName'])],
plotsDir=os.path.join(resultsDir, 'plots')
)
jobCode += slurmFooter()
jobs[jobName] = jobCode
# Submit jobs to slurm
for jobName, jobCode in jobs.items():
# Output file name
jobFile = os.path.join(projectDir, "runs", jobName + ".sh")
with open(jobFile, 'w') as handle:
handle.write(textwrap.dedent(jobCode))
# Submit to slurm
if not args.dry_run:
logger.info("Submitting jobs to slurm")
status = slurmSubmitJob(jobFile)
if status != 0:
logger.error("Slurm job '%s' not successfull" % jobFile)
sys.exit(1)
logger.debug("Project '%s'submission finished successfully." % args.project_name)
logger.debug("Finished comparison")
return (logger,
os.path.join(os.getcwd(), args.project_name + ".log"),
os.path.join(projectDir, "runs", args.project_name + ".log"))
def getReadType(bamFile, n=10):
"""
Gets the read type (single, paired) and length of bam file.
Returns tuple of (readType=string, readLength=int).
"""
from collections import Counter
p = subprocess.Popen(['samtools', 'view', bamFile], stdout=subprocess.PIPE)
# Count paired alignments
paired = 0
readLength = Counter()
while n > 0:
line = p.stdout.next().split("\t")
flag = int(line[1])
readLength[len(line[9])] += 1
if 1 & flag: # check decimal flag contains 1 (paired)
paired += 1
n -= 1
p.kill()
# Get most abundant read readLength
readLength = sorted(readLength)[-1]
# If at least half is paired, return True
if paired > (n / 2):
return ("PE", readLength)
else:
return ("SE", readLength)
def parseBowtieStats(statsFile):
"""
Parses Bowtie2 stats file, returns series with values.
"""
stats = pd.Series(index=["readCount", "unpaired", "unaligned", "unique", "multiple", "alignmentRate"])
with open(statsFile) as handle:
content = handle.readlines() # list of strings per line
# total reads
try:
line = [i for i in range(len(content)) if " reads; of these:" in content[i]][0]
stats["readCount"] = re.sub("\D.*", "", content[line])
line = [i for i in range(len(content)) if "were unpaired; of these:" in content[i]][0]
stats["unpaired"] = re.sub("\D", "", re.sub("\(.*", "", content[line]))
line = [i for i in range(len(content)) if "aligned 0 times" in content[i]][0]
stats["unaligned"] = re.sub("\D", "", re.sub("\(.*", "", content[line]))
line = [i for i in range(len(content)) if "aligned exactly 1 time" in content[i]][0]
stats["unique"] = re.sub("\D", "", re.sub("\(.*", "", content[line]))
line = [i for i in range(len(content)) if "aligned >1 times" in content[i]][0]
stats["multiple"] = re.sub("\D", "", re.sub("\(.*", "", content[line]))
line = [i for i in range(len(content)) if "overall alignment rate" in content[i]][0]
stats["alignmentRate"] = re.sub("\D.*", "", content[line])
except IndexError:
pass
return stats
def parseMACSModel(modelFile):
"""
Parses Bowtie2 stats file, returns series with values.
"""
out = dict()
with open(modelFile) as handle:
content = handle.readlines() # list of strings per line
try:
patt = re.compile("\((.+)\)")
line = [i for i in range(len(content)) if "p <- c(" in content[i]][0]
out["p"] = [float(i) for i in patt.search(content[line]).group(1).split(",")]
line = [i for i in range(len(content)) if "m <- c(" in content[i]][0]
out["m"] = [float(i) for i in patt.search(content[line]).group(1).split(",")]
line = [i for i in range(len(content)) if "ycorr <- c(" in content[i]][0]
out["ycorr"] = [float(i) for i in patt.search(content[line]).group(1).split(",")]
line = [i for i in range(len(content)) if "xcorr <- c(" in content[i]][0]
out["xcorr"] = [float(i) for i in patt.search(content[line]).group(1).split(",")]
line = [i for i in range(len(content)) if "altd <- c(" in content[i]][0]
out["altd"] = [float(i) for i in patt.search(content[line]).group(1).split(",")]
except IndexError:
pass
return out
def parseQC(sampleName, qcFile):
"""
Parses QC table produced by phantompeakqualtools (spp) and adds to sample annotation sheet
sample quality metrics for each sample.
"""
series = pd.Series(index=["NSC", "RSC", "qualityTag"])
try:
with open(qcFile) as handle:
content = handle.readlines() # list of strings per line
except:
return series
try:
line = [i for i in range(len(content)) if str(sampleName) in content[i]][0]
attrs = content[line].strip().split("\t")
series["NSC"] = attrs[-3]
series["RSC"] = attrs[-2]
series["qualityTag"] = attrs[-1]
except IndexError:
pass
return series
def makeDiffBindSheet(samples, df, peaksDir, sheetFile):
"""
Prepares and saves a diffBind annotation sheet from a pandas.DataFrame annotation with standard columns.
"""
# Exclude samples without matched control
df = df[df["controlSampleName"].notnull()]
# Convert numerical fields to str
for col in ["numberCells", "technique", "treatment", "patient"]:
df[col] = df[col].apply(str)
# Merge some annotation fields
df.loc[:, "condition"] = df["numberCells"] + "_" + df["technique"]
df.loc[:, "treatment"] = df["treatment"] + "_" + df["patient"]
# Complete rest of annotation
df.loc[:, "ControlID"] = df['controlSampleName']
df.loc[:, "bamControl"] = [samples[samples['sampleName'] == ctrl]["unmappedBam"].tolist()[0] for ctrl in df['controlSampleName']]
# TODO:
# Change peak path according to narrow/broad
df.loc[:, "Peaks"] = [os.path.join(peaksDir, sampleName, sampleName + "_peaks.narrowPeak") for sampleName in df["sampleName"]]
df["PeakCaller"] = "macs"
# Filter columns used
df = df[["sampleName", "cellLine", "ip", "condition", "treatment",
"biologicalReplicate", "unmappedBam", "ControlID", "bamControl", "Peaks", "PeakCaller"]]
# Rename columns
df.columns = ["SampleID", "Tissue", "Factor", "Condition", "Treatment",
"Replicate", "bamReads", "ControlID", "bamControl", "Peaks", "PeakCaller"]
# If resulting table is not empty, write to disk
if not df.empty:
# save as csv
df.to_csv(sheetFile, index=False) # check if format complies with DiffBind
# Return bool of empty
return df.empty
def slurmHeader(jobName, output, queue="shortq", ntasks=1, time="10:00:00",
cpusPerTask=16, memPerCpu=2000, nodes=1, userMail=""):
command = """ #!/bin/bash
#SBATCH --partition={0}
#SBATCH --ntasks={1}
#SBATCH --time={2}
#SBATCH --cpus-per-task={3}
#SBATCH --mem-per-cpu={4}
#SBATCH --nodes={5}
#SBATCH --job-name={6}
#SBATCH --output={7}
#SBATCH --mail-type=end
#SBATCH --mail-user={8}
# Start running the job
hostname
date
""".format(queue, ntasks, time, cpusPerTask, memPerCpu,
nodes, jobName, output, userMail)
return command
def slurmFooter():
command = """
# Job end
date
"""
return command
def slurmSubmitJob(jobFile):
command = "sbatch %s" % jobFile
return os.system(command)
def removeFile(fileName):
command = """
# Removing file
rm {0}
""".format(fileName)
return command
def moveFile(old, new):
command = """
# Moving file
mv {0} {1}
""".format(old, new)
return command
def makeDir(directory):
command = """
# Removing file
mkdir -p {0}
""".format(directory)
return command
def mergeBams(inputBams, outputBam):
command = """
# Merging bam files from replicates
echo "Merging bam files from replicates"
java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/MergeSamFiles.jar \\
USE_THREADING=TRUE \\
{1} \\
OUTPUT={0}
""".format(outputBam, (" ".join(["INPUT=%s"] * len(inputBams))) % tuple(inputBams))
return command
def fastqc(inputBam, outputDir):
command = """
# Measuring sample quality with Fastqc
echo "Measuring sample quality with Fastqc"
module load java/jdk/1.7.0_65
module load FastQC/0.11.2
fastqc --noextract \\
--outdir {0} \\
{1}
""".format(outputDir, inputBam)
return command
def bam2fastq(inputBam, outputFastq, outputFastq2=None, unpairedFastq=None):
command = """
# Convert to Fastq format
echo "Converting to Fastq format"
java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/SamToFastq.jar \\
INPUT={0} \\
""".format(inputBam)
if outputFastq2 is None and unpairedFastq is None:
command += """FASTQ={0}
""".format(outputFastq)
else:
command += """FASTQ={0} \\
SECOND_END_FASTQ={1} \\
UNPAIRED_FASTQ={2}
""".format(outputFastq, outputFastq2, unpairedFastq)
return command
def trimmomatic(inputFastq1, outputFastq1, cpus, adapters, log,
inputFastq2=None, outputFastq1unpaired=None,
outputFastq2=None, outputFastq2unpaired=None):
PE = False if inputFastq2 is None else True
pe = "PE" if PE else "SE"
command = """
# Trimming adapters from sample
echo "Trimming adapters from sample"
module load trimmomatic/0.32
java -Xmx4g -jar `which trimmomatic-0.32.jar` """
command += """{0} \\
-threads {1} \\
-trimlog {2} \\
{3} \\
""".format(pe, cpus, log, inputFastq1)
if PE:
command += """\\
{0} \\
""".format(inputFastq2)
command += """\\
{0} \\
""".format(outputFastq1)
if PE:
command += """\\
{0} \\
{1} \\
{2} \\
""".format(outputFastq1unpaired,
outputFastq2, outputFastq2unpaired)
command += """\\
ILLUMINACLIP:{0}:1:40:15:8:true \\
LEADING:3 TRAILING:3 \\
SLIDINGWINDOW:4:10 \\
MINLEN:36
""".format(adapters)
return command
def skewer(inputFastq1, outputPrefix, cpus, adapters, inputFastq2=None):
PE = False if inputFastq2 is None else True
mode = "pe" if PE else "any"
command = """
# Trimming adapters from sample
echo "Trimming adapters from sample"
skewer --quiet \\
-t {0} \\
-m {1} \\
-x {2} \\
-o {3} \\
{4} """.format(cpus, mode, adapters, outputPrefix, inputFastq1)
if inputFastq2 is not None:
command += """\\
{0}
""".format(inputFastq2)
return command
def bowtie2Map(inputFastq1, outputBam, log, metrics, genomeIndex, maxInsert, cpus, inputFastq2=None):
"""
"""
outputBam = re.sub("\.bam$", "", outputBam)
# Admits 2000bp-long fragments (--maxins option)
command = """
# Mapping reads with Bowtie2
echo "Mapping reads with Bowtie2"
module load bowtie/2.2.3
module load samtools
bowtie2 --very-sensitive -p {0} \\
-x {1} \\
--met-file {2} \\
""".format(cpus, genomeIndex, metrics)
if inputFastq2 is None:
command += "{0} ".format(inputFastq1)
else:
command += """--maxins {0} -1 {1} \\
-2 {2} """.format(maxInsert, inputFastq1, inputFastq2)
command += """\\
2> {0} | \\
samtools view -S -b - | \\
samtools sort - {1}
""".format(log, outputBam)
return command
def shiftReads(inputBam, genome, outputBam):
outputBam = re.sub("\.bam$", "", outputBam)
# TODO:
# Implement read shifting with HTSeq or Cython
command = """
# Shifting read of tagmented sample
echo "Shifting read of tagmented sample"
module load samtools
module load python
samtools view -h {0} | \\
python {3}/lib/shift_reads.py {1} | \\
samtools view -S -b - | \\
samtools sort - {2}
""".format(inputBam, genome, outputBam,
os.path.abspath(os.path.dirname(os.path.realpath(__file__))))
return command
def markDuplicates(inputBam, outputBam, metricsFile, tempDir="."):
transientFile = re.sub("\.bam$", "", outputBam) + ".dups.nosort.bam"
outputBam = re.sub("\.bam$", "", outputBam)
command = """
# Marking duplicates with piccard
echo "Mark duplicates with piccard"
module load samtools
java -Xmx4g -jar /cm/shared/apps/picard-tools/1.118/MarkDuplicates.jar \\
INPUT={0} \\
OUTPUT={1} \\
METRICS_FILE={2} \\
VALIDATION_STRINGENCY=LENIENT \\
TMP_DIR={3}
# Sort bam file with marked duplicates
samtools sort {1} \\
{4}
if [[ -s {1} ]]
then
rm {1}
fi
""".format(inputBam, transientFile, metricsFile, tempDir, outputBam)
return command
def removeDuplicates(inputBam, outputBam, cpus=16):
command = """
# Removing duplicates with sambamba
echo "Remove duplicates with sambamba"
sambamba markdup -t {2} -r \\
{0} \\
{1}
""".format(inputBam, outputBam, cpus)
return command
def indexBam(inputBam):
command = """
# Indexing bamfile with samtools
echo "Indexing bamfile with samtools"
module load samtools
samtools index {0}
""".format(inputBam)
return command
def peakTools(inputBam, output, plot, cpus):
command = """
# Get sample QC metrics
echo "Getting sample QC metrics"
module load boost/1.57
module load gcc/4.8.2
module load openmpi/gcc/64/1.8.1
run_spp.R -rf -savp -savp={2} -s=0:5:500 \\
-c={0} -out={1}
""".format(inputBam, output, plot)
return command
def qc():
"""
$PRESEQ c_curve -B $PROJECTDIR/mapped/$SAMPLE_NAME.trimmed.bowtie2.sorted.shifted.dup.bam \
-o $PROJECTDIR/mapped/$SAMPLE_NAME_qc_c_curve.txt
$PRESEQ lc_extrap -e 1e8 -s 2e6 -B $PROJECTDIR/mapped/$SAMPLE_NAME.trimmed.bowtie2.sorted.shifted.dup.bam \
-o $PROJECTDIR/mapped/$SAMPLE_NAME_qc_lc_extrap.txt
"""
raise NotImplementedError("Function not implemented yet.")
def bamToUCSC(inputBam, outputBigWig, genomeSizes, genome, tagmented=False):
transientFile = os.path.abspath(re.sub("\.bigWig", "", outputBigWig))
if not tagmented:
# TODO:
# addjust fragment length dependent on read size and real fragment size
# (right now it asssumes 50bp reads with 180bp fragments)
command = """
# Making bigWig tracks from bam file
echo "making bigWig tracks from bam file"
module load bedtools
bedtools bamtobed -i {0} | \\
bedtools slop -i stdin -g {1} -s -l 0 -r 130 | \\
python {5}/lib/fix_bedfile_genome_boundaries.py {4} | \\
genomeCoverageBed -i stdin -bg -g {1} \\
> {2}.cov
bedGraphToBigWig {2}.cov \\
{1} \\
{3}
# remove cov file
if [[ -s {2}.cov ]]
then
rm {2}.cov
fi
chmod 755 {3}
""".format(inputBam, genomeSizes, transientFile, outputBigWig, genome,
os.path.abspath(os.path.dirname(os.path.realpath(__file__))))
else:
command = """
# Making bigWig tracks from bam file
echo "making bigWig tracks from bam file"
module load bedtools
bedtools bamtobed -i {0} |
genomeCoverageBed -5 -i stdin -bg -g {1} \\
> {2}.cov
bedGraphToBigWig {2}.cov \\
{1} \\
{3}
# remove cov file
if [[ -s {2}.cov ]]
then
rm {2}.cov
fi
chmod 755 {3}
""".format(inputBam, genomeSizes, transientFile, outputBigWig,
os.path.abspath(os.path.dirname(os.path.realpath(__file__))))
return command
def addTrackToHub(sampleName, trackURL, trackHub, colour, fivePrime=""):
command = """
# Adding track to TrackHub
echo "Adding track to TrackHub"
echo "track type=bigWig name='{0} {1}' description='{0} {1}' """.format(sampleName, fivePrime)
command += """height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> \\
{2}
chmod 755 {2}
""".format(trackURL, colour, trackHub)
return command
def linkToTrackHub(trackHubURL, fileName, genome):
db = "org" if genome == "hg19" else "db" # different database call for human
genome = "human" if genome == "hg19" else genome # change hg19 to human
html = """
<html>
<head>
<meta http-equiv="refresh" content="0; url=http://genome.ucsc.edu/cgi-bin/hgTracks?"""
html += """{db}={genome}&hgt.customText={trackHubURL}" />
</head>
</html>
""".format(trackHubURL=trackHubURL, genome=genome, db=db)
with open(fileName, 'w') as handle:
handle.write(textwrap.dedent(html))
def genomeWideCoverage(inputBam, genomeWindows, output):
command = """
# Calculate genome-wide coverage
echo "Calculating genome-wide coverage"
module load bedtools
bedtools coverage -abam -counts \\
-a {0} \\
-b {1} \\
> {2}
""".format(inputBam, genomeWindows, output)
return command
def calculateFRiP(inputBam, inputBed, output):
command = "cut -f 1,2,3 {0} ".format(inputBed)
command += "| bedtools coverage -counts "
command += "-abam {0} ".format(inputBam)
command += "-b - "
command += "| awk '{{sum+=$4}} END {{print sum}}' > {0}".format(output)
return command
def macs2CallPeaks(treatmentBam, controlBam, outputDir, sampleName, genome, broad=False, plot=True):
if genome == "hg19":
genome = "hs"
elif genome == "mm10":
genome = "mm"
elif genome == "dr7":
raise ValueError("Genome dr7 not yet supported for peak calling with MACS2.")
if not broad:
command = """
# Call peaks with MACS2
echo "Calling peaks with MACS2"
macs2 callpeak \\
-t {0} \\
-c {1} \\
--bw 200 \\
-g {2} -n {3} --outdir {4}
""".format(treatmentBam, controlBam, genome, sampleName, outputDir)
command = """
# Call peaks with MACS2
echo "Calling peaks with MACS2"
macs2 callpeak \\
-t {0} \\
-c {1} \\
--bw 200 \\
--fix-bimodal --extsize 200 \\
-g {2} -n {3} --outdir {4}
""".format(treatmentBam, controlBam, genome, sampleName, outputDir)
else:
# Parameter setting for broad factors according to Nature Protocols (2012)
# Vol.7 No.9 1728-1740 doi:10.1038/nprot.2012.101 Protocol (D) for H3K36me3
command = """
# Call peaks with MACS2
echo "Calling peaks with MACS2"
macs2 callpeak \\
-t {0} \\
-c {1} \\
--broad --nomodel --extsize 73 --pvalue 1e-3 \\
-g {2} -n {3} --outdir {4}
""".format(treatmentBam, controlBam, genome, sampleName, outputDir)
if plot and not broad:
command += """
# Plot MACS2 crosscorrelation
echo "Plotting MACS2 crosscorrelation"
module load R
Rscript {0}/{1}_model.r
mv {2}/{1}_model.pdf {0}/{1}_model.pdf
""".format(outputDir, sampleName, os.getcwd())
return command
def sppCallPeaks(treatmentBam, controlBam, treatmentName, controlName, outputDir, broad, cpus):
broad = "TRUE" if broad else "FALSE"
command = """
# Calling peaks with SPP
echo "calling peaks with SPP"
module load R
Rscript {6}/lib/spp_peak_calling.R \\
{0} \\
{1} \\
{2} \\
{3} \\
{4} \\
{5} \\
{6}
""".format(treatmentBam, controlBam, treatmentName, controlName, broad, cpus,
os.path.abspath(os.path.dirname(os.path.realpath(__file__))))
return command
def homerFindMotifs(peakFile, genome, outputDir, size=150, length="8,10,12,14,16", n_motifs=12):
command = """
# Find motifs with Homer
echo "finding motifs with Homer"
findMotifsGenome.pl {0} \\
{1} \\
{2} \\
-mask -size {3} -len {4} -S {5}
""".format(peakFile, genome, outputDir, size, length, n_motifs)
return command
def AnnotatePeaks(peakFile, genome, motifFile, outputBed):
command = """
# Annotate peaks with motif score
echo "annotating peaks with motif score"
annotatePeaks.pl {0} \\
{1} \\
-mask -mscore -m {2} | \\
tail -n +2 | cut -f 1,5,22 \\
> {3}
""".format(peakFile, genome, motifFile, outputBed)
return command
def centerPeaksOnMotifs(peakFile, genome, windowWidth, motifFile, outputBed):
command = """
# Center peaks on motif
echo "centering peaks on motif"
annotatePeaks.pl {0} \\
{1} \\
-size {2} -center {3} | \\
awk -v OFS='\\t' '{{print $2, $3, $4, $1, $6, $5}}' | \\
awk -v OFS='\\t' -F '\t' '{{ gsub("0", "+", $6) ; gsub("1", "-", $6) ; print }}' | \\
python {5}/lib/fix_bedfile_genome_boundaries.py {1} | \\
sortBed > {4}
""".format(peakFile, genome, windowWidth, motifFile, outputBed,
os.path.abspath(os.path.dirname(os.path.realpath(__file__))))
return command
def peakAnalysis(inputBam, peakFile, plotsDir, windowWidth, fragmentsize,
genome, n_clusters, strand_specific, duplicates):
command = """
# Analyse peak profiles
echo "Analysing peak profiles"
python {0}/lib/peaks_analysis.py {1} \\
{2} \\
{3} \\
--window-width {4} \\
--fragment-size {5} \\
--genome {6} \\
--n_clusters {7} """.format(
os.path.abspath(os.path.dirname(os.path.realpath(__file__))),
inputBam, peakFile, plotsDir, windowWidth, fragmentsize, genome, n_clusters
)
if strand_specific:
command += "--strand-specific "
if duplicates:
command += "--duplicates\n"
return command
def tssAnalysis(inputBam, tssFile, plotsDir, windowWidth, fragmentsize, genome,
n_clusters, strand_specific, duplicates):
command = """
# Analyse signal over TSSs
echo "Analysing signal over TSSs"
python {0}/lib/tss_analysis.py {1} \\
{2} \\
{3} \\
--window-width {4} \\
--fragment-size {5} \\
--genome {6} \\
--n_clusters {7} """.format(
os.path.abspath(os.path.dirname(os.path.realpath(__file__))),
inputBam, tssFile, plotsDir, windowWidth, fragmentsize, genome, n_clusters
)
if strand_specific:
command += "--strand-specific "
if duplicates:
command += "--duplicates "
return command
def footprintAnalysis():
raise NotImplementedError("Function not implemented yet.")
def plotCorrelations(inputCoverage, plotsDir):
command = """
# Plot correlations
echo "plotting correlations"
python {2}/lib/correlations.py \\
{0} \\
{1}
""".format(plotsDir,
" ".join(["%s"] * len(inputCoverage)) % tuple(inputCoverage),
os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
)
return command
def diffBind(inputCSV, jobName, plotsDir):
command = """
# Detect differential binding with diffBind
echo "Detecting differential binding with diffBind"
module load R
Rscript {3}/lib/diffBind_analysis.R \\
{0} \\
{1} \\
{2} \\
""".format(inputCSV, jobName, plotsDir,
os.path.abspath(os.path.dirname(os.path.realpath(__file__))))
return command
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
print("Program canceled by user!")
sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment