Skip to content

Instantly share code, notes, and snippets.

@ecwheele
Created March 17, 2017 16:45
Show Gist options
  • Save ecwheele/5b6ff6625bcfca1534b5f11a58648840 to your computer and use it in GitHub Desktop.
Save ecwheele/5b6ff6625bcfca1534b5f11a58648840 to your computer and use it in GitHub Desktop.
#!/bin/env python
import pybedtools
import subprocess
from argparse import ArgumentParser
import os
#Todo: Need to add a check to see if the shuffled bed file already exists
def get_shuffled_region(bed_file, bed_file_to_shuffle_within):
"""
Makes a shuffled file
:param bed_file: Bed file of peaks to shuffle
:param bed_file_to_shuffle_within: Bed file of regions to include in the shuffling.
This is typicalls a file that matches the genic region of peaks (exons, 3'UTRs, etc.)
:return: a shuffled bedtool object
"""
bed_file_tool = pybedtools.BedTool(bed_file)
shuffled = bed_file_tool.shuffle(genome="hg19",
incl=bed_file_to_shuffle_within,
noOverlapping=True)
return shuffled
def fix_strand_on_shuffled_bedtool(bed_file, shuffled_bedtool, bed_file_to_shuffle_within):
"""
Bedtools shuffle keeps the strand from the original file, but we need to take the strand
from the reference file. So we fix that here.
:param bed_file: original bed file to shuffle from. This function will save a shuffled file
with the same name but the extension .shuffled.bed
:param shuffled_bedtool: Bedtool that has been shuffled
:param bed_file_to_shuffle_within: Bed file that was used as the -incl flag in the shuffle
:return: a fixed bedtool object with the correct strand
"""
bed_tool_to_shuffle_within = pybedtools.BedTool(bed_file_to_shuffle_within)
intersected = shuffled_bedtool.sort().intersect(bed_tool_to_shuffle_within.sort(), wao=True)
shuffled_tool_field_count = shuffled_bedtool.field_count()
regions_tool_field_count = bed_tool_to_shuffle_within.field_count()
total_header_size = shuffled_tool_field_count + regions_tool_field_count + 1
intersected = intersected.to_dataframe(names=range(total_header_size))
intersected_grouped = intersected.groupby([0, 1, 2, 3]).first()
intersected[5] = intersected[shuffled_tool_field_count + 5]
values = intersected_grouped.apply(lambda x: "\t".join([str(item) for item in list(x.name) +
list(x[:shuffled_tool_field_count - 4])]), axis=1).values
fixed_shuffled_bedtool = pybedtools.BedTool("\n".join(values), from_string=True)
fixed_shuffled_bedtool.saveas(bed_file.replace(".bed", ".shuffled.bed"))
def run_homer(foreground_bed, background_bed, output_folder):
"""
Run homer to find motifs
:param foreground_bed: Bed file of peaks to search for motifs (make sure it contains strand)
:param background_bed: Bed file of background list of regions (make sure it contains strand)
:param output_folder: Folder to create with HOMER results
:return: None, HOMER folder will be saved
"""
p = subprocess.Popen("findMotifsGenome.pl {} hg19 {} -bg {} -len 5,6,7 -rna".format(
str(foreground_bed), str(output_folder), str(background_bed)),
shell=True, stdout=subprocess.PIPE)
p.communicate()
def make_background_and_run_homer(bed_file, bed_file_to_shuffle_within, output_folder):
"""
Main function to get background and run homer
:param bed_file: Bed file of peaks
:param bed_file_to_shuffle_within: Bed file of regions to include in the shuffling.
This is typicalls a file that matches the genic region of peaks (exons, 3'UTRs, etc.)
:param output_folder: Output folder to save HOMER results
:return:
"""
shuffled = get_shuffled_region(bed_file=bed_file,
bed_file_to_shuffle_within=bed_file_to_shuffle_within)
fix_strand_on_shuffled_bedtool(bed_file=bed_file,
shuffled_bedtool=shuffled,
bed_file_to_shuffle_within=bed_file_to_shuffle_within)
run_homer(foreground_bed=bed_file,
background_bed=bed_file.replace(".bed", ".shuffled.bed"),
output_folder=output_folder)
def main():
parser = ArgumentParser()
parser.add_argument(
"--bed_file",
dest="bed_file",
help="bed file of peaks to search for motifs",
default=None
)
parser.add_argument(
"--bed_file_to_shuffle_within",
dest="bed_file_to_shuffle_within",
help="Bed file of regions to include in the shuffling. \
This is typicalls a file that matches the genic region of peaks (exons, 3'UTRs, etc.)",
default=None
)
parser.add_argument(
"--output_folder",
dest="output_folder",
help="Output folder to save HOMER results",
default=None
)
opts = parser.parse_args()
make_background_and_run_homer(bed_file=opts.bed_file,
bed_file_to_shuffle_within=opts.bed_file_to_shuffle_within,
output_folder=opts.output_folder)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment