Skip to content

Instantly share code, notes, and snippets.

@TTTPOB
Created February 3, 2022 15:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TTTPOB/5d479537e6fab43064df4ddabbc7884a to your computer and use it in GitHub Desktop.
Save TTTPOB/5d479537e6fab43064df4ddabbc7884a to your computer and use it in GitHub Desktop.
get sequence location on genome, advantage of this script is that it will recognize GGGG as 2 NGG sites.
#!/usr/bin/env python3
import re
from multiprocessing import Pool
from Bio import SeqIO
from Bio.Seq import Seq
import click
from pathlib import Path
import pandas as pd
import numpy as np
def get_motif_regex(motif: str):
motif_length = len(motif)
first_char = motif[0]
other_part = motif[1:]
regex = f"{first_char}(?={other_part})"
regex = (
regex.replace("N", ".")
.replace("R", "[AG]")
.replace("Y", "[CT]")
.replace("S", "[GC]")
)
return re.compile(regex, re.IGNORECASE)
def find_motif_on_chromsome(sequence: str, motif: re.Pattern):
# use finditer to find all motifs in the genome
locations = [m.start() for m in motif.finditer(sequence)]
return locations
def get_end_location(motif_location: list, motif_length: int):
end_location = [start + motif_length for start in motif_location]
return end_location
def dict_to_dataframe(
all_seqnames, pos_start: dict, neg_start: dict, pos_end: dict, neg_end: dict
):
chr_df = {}
for seqname in all_seqnames:
p_start = pos_start[seqname]
n_start = neg_start[seqname]
start = p_start + n_start
p_end = pos_end[seqname]
n_end = neg_end[seqname]
end = p_end + n_end
chr_df[seqname] = pd.DataFrame(
{"seqnames": seqname, "start": start, "end": end}
)
all_df = pd.concat(chr_df)
return all_df
def find_motif_param_generator(genome_dict, regex):
for seqname, sequence in genome_dict.items():
sequence = str(sequence.seq)
yield sequence, regex
def main(fasta_path: str, motif: str, bedfile: str):
pos_regex = get_motif_regex(motif)
neg_motif = Seq(motif).reverse_complement()
neg_motif = str(neg_motif)
neg_regex = get_motif_regex(neg_motif)
genome = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
all_seqnames = list(genome.keys())
with Pool(8) as p:
pos_start = p.starmap(
find_motif_on_chromsome,
find_motif_param_generator(genome, pos_regex),
)
neg_start = p.starmap(
find_motif_on_chromsome,
find_motif_param_generator(genome, neg_regex),
)
pos_start_dict = {
seqname: pos_start for seqname, pos_start in zip(all_seqnames, pos_start)
}
neg_start_dict = {
seqname: neg_start for seqname, neg_start in zip(all_seqnames, neg_start)
}
pos_end = {
seqname: get_end_location(start, len(motif))
for seqname, start in pos_start_dict.items()
}
neg_end = {
seqname: get_end_location(start, len(motif))
for seqname, start in neg_start_dict.items()
}
all_df = dict_to_dataframe(all_seqnames, pos_start_dict, neg_start_dict, pos_end, neg_end)
all_df = all_df.astype({"start": "int64", "end": "int64"})
target_dir = Path(bedfile).parent
target_dir.mkdir(parents=True, exist_ok=True)
all_df.to_csv(bedfile, sep="\t", index=False, header=False)
@click.command()
@click.option("--fasta", help="path to fasta file")
@click.option("--motif", help="sequence to search for")
@click.option("--bedfile", help="path to bedfile")
def cli(fasta, motif, bedfile):
main(fasta, motif, bedfile)
if __name__ == "__main__":
cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment