Created
March 17, 2017 16:45
-
-
Save ecwheele/5b6ff6625bcfca1534b5f11a58648840 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python | |
import pybedtools | |
import subprocess | |
from argparse import ArgumentParser | |
import os | |
#Todo: Need to add a check to see if the shuffled bed file already exists | |
def get_shuffled_region(bed_file, bed_file_to_shuffle_within): | |
""" | |
Makes a shuffled file | |
:param bed_file: Bed file of peaks to shuffle | |
:param bed_file_to_shuffle_within: Bed file of regions to include in the shuffling. | |
This is typicalls a file that matches the genic region of peaks (exons, 3'UTRs, etc.) | |
:return: a shuffled bedtool object | |
""" | |
bed_file_tool = pybedtools.BedTool(bed_file) | |
shuffled = bed_file_tool.shuffle(genome="hg19", | |
incl=bed_file_to_shuffle_within, | |
noOverlapping=True) | |
return shuffled | |
def fix_strand_on_shuffled_bedtool(bed_file, shuffled_bedtool, bed_file_to_shuffle_within): | |
""" | |
Bedtools shuffle keeps the strand from the original file, but we need to take the strand | |
from the reference file. So we fix that here. | |
:param bed_file: original bed file to shuffle from. This function will save a shuffled file | |
with the same name but the extension .shuffled.bed | |
:param shuffled_bedtool: Bedtool that has been shuffled | |
:param bed_file_to_shuffle_within: Bed file that was used as the -incl flag in the shuffle | |
:return: a fixed bedtool object with the correct strand | |
""" | |
bed_tool_to_shuffle_within = pybedtools.BedTool(bed_file_to_shuffle_within) | |
intersected = shuffled_bedtool.sort().intersect(bed_tool_to_shuffle_within.sort(), wao=True) | |
shuffled_tool_field_count = shuffled_bedtool.field_count() | |
regions_tool_field_count = bed_tool_to_shuffle_within.field_count() | |
total_header_size = shuffled_tool_field_count + regions_tool_field_count + 1 | |
intersected = intersected.to_dataframe(names=range(total_header_size)) | |
intersected_grouped = intersected.groupby([0, 1, 2, 3]).first() | |
intersected[5] = intersected[shuffled_tool_field_count + 5] | |
values = intersected_grouped.apply(lambda x: "\t".join([str(item) for item in list(x.name) + | |
list(x[:shuffled_tool_field_count - 4])]), axis=1).values | |
fixed_shuffled_bedtool = pybedtools.BedTool("\n".join(values), from_string=True) | |
fixed_shuffled_bedtool.saveas(bed_file.replace(".bed", ".shuffled.bed")) | |
def run_homer(foreground_bed, background_bed, output_folder): | |
""" | |
Run homer to find motifs | |
:param foreground_bed: Bed file of peaks to search for motifs (make sure it contains strand) | |
:param background_bed: Bed file of background list of regions (make sure it contains strand) | |
:param output_folder: Folder to create with HOMER results | |
:return: None, HOMER folder will be saved | |
""" | |
p = subprocess.Popen("findMotifsGenome.pl {} hg19 {} -bg {} -len 5,6,7 -rna".format( | |
str(foreground_bed), str(output_folder), str(background_bed)), | |
shell=True, stdout=subprocess.PIPE) | |
p.communicate() | |
def make_background_and_run_homer(bed_file, bed_file_to_shuffle_within, output_folder): | |
""" | |
Main function to get background and run homer | |
:param bed_file: Bed file of peaks | |
:param bed_file_to_shuffle_within: Bed file of regions to include in the shuffling. | |
This is typicalls a file that matches the genic region of peaks (exons, 3'UTRs, etc.) | |
:param output_folder: Output folder to save HOMER results | |
:return: | |
""" | |
shuffled = get_shuffled_region(bed_file=bed_file, | |
bed_file_to_shuffle_within=bed_file_to_shuffle_within) | |
fix_strand_on_shuffled_bedtool(bed_file=bed_file, | |
shuffled_bedtool=shuffled, | |
bed_file_to_shuffle_within=bed_file_to_shuffle_within) | |
run_homer(foreground_bed=bed_file, | |
background_bed=bed_file.replace(".bed", ".shuffled.bed"), | |
output_folder=output_folder) | |
def main(): | |
parser = ArgumentParser() | |
parser.add_argument( | |
"--bed_file", | |
dest="bed_file", | |
help="bed file of peaks to search for motifs", | |
default=None | |
) | |
parser.add_argument( | |
"--bed_file_to_shuffle_within", | |
dest="bed_file_to_shuffle_within", | |
help="Bed file of regions to include in the shuffling. \ | |
This is typicalls a file that matches the genic region of peaks (exons, 3'UTRs, etc.)", | |
default=None | |
) | |
parser.add_argument( | |
"--output_folder", | |
dest="output_folder", | |
help="Output folder to save HOMER results", | |
default=None | |
) | |
opts = parser.parse_args() | |
make_background_and_run_homer(bed_file=opts.bed_file, | |
bed_file_to_shuffle_within=opts.bed_file_to_shuffle_within, | |
output_folder=opts.output_folder) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment