Created
February 3, 2022 15:49
-
-
Save TTTPOB/5d479537e6fab43064df4ddabbc7884a to your computer and use it in GitHub Desktop.
get sequence location on genome, advantage of this script is that it will recognize GGGG as 2 NGG sites.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
from multiprocessing import Pool | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
import click | |
from pathlib import Path | |
import pandas as pd | |
import numpy as np | |
def get_motif_regex(motif: str): | |
motif_length = len(motif) | |
first_char = motif[0] | |
other_part = motif[1:] | |
regex = f"{first_char}(?={other_part})" | |
regex = ( | |
regex.replace("N", ".") | |
.replace("R", "[AG]") | |
.replace("Y", "[CT]") | |
.replace("S", "[GC]") | |
) | |
return re.compile(regex, re.IGNORECASE) | |
def find_motif_on_chromsome(sequence: str, motif: re.Pattern): | |
# use finditer to find all motifs in the genome | |
locations = [m.start() for m in motif.finditer(sequence)] | |
return locations | |
def get_end_location(motif_location: list, motif_length: int): | |
end_location = [start + motif_length for start in motif_location] | |
return end_location | |
def dict_to_dataframe( | |
all_seqnames, pos_start: dict, neg_start: dict, pos_end: dict, neg_end: dict | |
): | |
chr_df = {} | |
for seqname in all_seqnames: | |
p_start = pos_start[seqname] | |
n_start = neg_start[seqname] | |
start = p_start + n_start | |
p_end = pos_end[seqname] | |
n_end = neg_end[seqname] | |
end = p_end + n_end | |
chr_df[seqname] = pd.DataFrame( | |
{"seqnames": seqname, "start": start, "end": end} | |
) | |
all_df = pd.concat(chr_df) | |
return all_df | |
def find_motif_param_generator(genome_dict, regex): | |
for seqname, sequence in genome_dict.items(): | |
sequence = str(sequence.seq) | |
yield sequence, regex | |
def main(fasta_path: str, motif: str, bedfile: str): | |
pos_regex = get_motif_regex(motif) | |
neg_motif = Seq(motif).reverse_complement() | |
neg_motif = str(neg_motif) | |
neg_regex = get_motif_regex(neg_motif) | |
genome = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta")) | |
all_seqnames = list(genome.keys()) | |
with Pool(8) as p: | |
pos_start = p.starmap( | |
find_motif_on_chromsome, | |
find_motif_param_generator(genome, pos_regex), | |
) | |
neg_start = p.starmap( | |
find_motif_on_chromsome, | |
find_motif_param_generator(genome, neg_regex), | |
) | |
pos_start_dict = { | |
seqname: pos_start for seqname, pos_start in zip(all_seqnames, pos_start) | |
} | |
neg_start_dict = { | |
seqname: neg_start for seqname, neg_start in zip(all_seqnames, neg_start) | |
} | |
pos_end = { | |
seqname: get_end_location(start, len(motif)) | |
for seqname, start in pos_start_dict.items() | |
} | |
neg_end = { | |
seqname: get_end_location(start, len(motif)) | |
for seqname, start in neg_start_dict.items() | |
} | |
all_df = dict_to_dataframe(all_seqnames, pos_start_dict, neg_start_dict, pos_end, neg_end) | |
all_df = all_df.astype({"start": "int64", "end": "int64"}) | |
target_dir = Path(bedfile).parent | |
target_dir.mkdir(parents=True, exist_ok=True) | |
all_df.to_csv(bedfile, sep="\t", index=False, header=False) | |
@click.command() | |
@click.option("--fasta", help="path to fasta file") | |
@click.option("--motif", help="sequence to search for") | |
@click.option("--bedfile", help="path to bedfile") | |
def cli(fasta, motif, bedfile): | |
main(fasta, motif, bedfile) | |
if __name__ == "__main__": | |
cli() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment