TTTPOB/get_sequence_location_on_genome.py

## get_sequence_location_on_genome.py
#!/usr/bin/env python3
import re
from multiprocessing import Pool
from Bio import SeqIO
from Bio.Seq import Seq
import click
from pathlib import Path
import pandas as pd
import numpy as np


def get_motif_regex(motif: str):
    motif_length = len(motif)
    first_char = motif[0]
    other_part = motif[1:]
    regex = f"{first_char}(?={other_part})"
    regex = (
        regex.replace("N", ".")
        .replace("R", "[AG]")
        .replace("Y", "[CT]")
        .replace("S", "[GC]")
    )
    return re.compile(regex, re.IGNORECASE)


def find_motif_on_chromsome(sequence: str, motif: re.Pattern):
    # use finditer to find all motifs in the genome
    locations = [m.start() for m in motif.finditer(sequence)]
    return locations


def get_end_location(motif_location: list, motif_length: int):
    end_location = [start + motif_length for start in motif_location]
    return end_location


def dict_to_dataframe(
    all_seqnames, pos_start: dict, neg_start: dict, pos_end: dict, neg_end: dict
):
    chr_df = {}
    for seqname in all_seqnames:
        p_start = pos_start[seqname]
        n_start = neg_start[seqname]
        start = p_start + n_start
        p_end = pos_end[seqname]
        n_end = neg_end[seqname]
        end = p_end + n_end
        chr_df[seqname] = pd.DataFrame(
            {"seqnames": seqname, "start": start, "end": end}
        )
    all_df = pd.concat(chr_df)
    return all_df

def find_motif_param_generator(genome_dict, regex):
    for seqname, sequence in genome_dict.items():
        sequence = str(sequence.seq)
        yield sequence, regex


def main(fasta_path: str, motif: str, bedfile: str):
    pos_regex = get_motif_regex(motif)
    neg_motif = Seq(motif).reverse_complement()
    neg_motif = str(neg_motif)
    neg_regex = get_motif_regex(neg_motif)
    genome = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
    all_seqnames = list(genome.keys())

    with Pool(8) as p:
        pos_start = p.starmap(
            find_motif_on_chromsome,
            find_motif_param_generator(genome, pos_regex),
        )
        neg_start = p.starmap(
            find_motif_on_chromsome,
            find_motif_param_generator(genome, neg_regex),
        )
    pos_start_dict = {
        seqname: pos_start for seqname, pos_start in zip(all_seqnames, pos_start)
    }
    neg_start_dict = {
        seqname: neg_start for seqname, neg_start in zip(all_seqnames, neg_start)
    }


    pos_end = {
        seqname: get_end_location(start, len(motif))
        for seqname, start in pos_start_dict.items()
    }
    neg_end = {
        seqname: get_end_location(start, len(motif))
        for seqname, start in neg_start_dict.items()
    }
    all_df = dict_to_dataframe(all_seqnames, pos_start_dict, neg_start_dict, pos_end, neg_end)
    all_df = all_df.astype({"start": "int64", "end": "int64"})
    target_dir = Path(bedfile).parent
    target_dir.mkdir(parents=True, exist_ok=True)
    all_df.to_csv(bedfile, sep="\t", index=False, header=False)


@click.command()
@click.option("--fasta", help="path to fasta file")
@click.option("--motif", help="sequence to search for")
@click.option("--bedfile", help="path to bedfile")
def cli(fasta, motif, bedfile):
    main(fasta, motif, bedfile)


if __name__ == "__main__":
    cli()
	#!/usr/bin/env python3
	import re
	from multiprocessing import Pool
	from Bio import SeqIO
	from Bio.Seq import Seq
	import click
	from pathlib import Path
	import pandas as pd
	import numpy as np


	def get_motif_regex(motif: str):
	motif_length = len(motif)
	first_char = motif[0]
	other_part = motif[1:]
	regex = f"{first_char}(?={other_part})"
	regex = (
	regex.replace("N", ".")
	.replace("R", "[AG]")
	.replace("Y", "[CT]")
	.replace("S", "[GC]")
	)
	return re.compile(regex, re.IGNORECASE)


	def find_motif_on_chromsome(sequence: str, motif: re.Pattern):
	# use finditer to find all motifs in the genome
	locations = [m.start() for m in motif.finditer(sequence)]
	return locations


	def get_end_location(motif_location: list, motif_length: int):
	end_location = [start + motif_length for start in motif_location]
	return end_location


	def dict_to_dataframe(
	all_seqnames, pos_start: dict, neg_start: dict, pos_end: dict, neg_end: dict
	):
	chr_df = {}
	for seqname in all_seqnames:
	p_start = pos_start[seqname]
	n_start = neg_start[seqname]
	start = p_start + n_start
	p_end = pos_end[seqname]
	n_end = neg_end[seqname]
	end = p_end + n_end
	chr_df[seqname] = pd.DataFrame(
	{"seqnames": seqname, "start": start, "end": end}
	)
	all_df = pd.concat(chr_df)
	return all_df

	def find_motif_param_generator(genome_dict, regex):
	for seqname, sequence in genome_dict.items():
	sequence = str(sequence.seq)
	yield sequence, regex


	def main(fasta_path: str, motif: str, bedfile: str):
	pos_regex = get_motif_regex(motif)
	neg_motif = Seq(motif).reverse_complement()
	neg_motif = str(neg_motif)
	neg_regex = get_motif_regex(neg_motif)
	genome = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
	all_seqnames = list(genome.keys())

	with Pool(8) as p:
	pos_start = p.starmap(
	find_motif_on_chromsome,
	find_motif_param_generator(genome, pos_regex),
	)
	neg_start = p.starmap(
	find_motif_on_chromsome,
	find_motif_param_generator(genome, neg_regex),
	)
	pos_start_dict = {
	seqname: pos_start for seqname, pos_start in zip(all_seqnames, pos_start)
	}
	neg_start_dict = {
	seqname: neg_start for seqname, neg_start in zip(all_seqnames, neg_start)
	}


	pos_end = {
	seqname: get_end_location(start, len(motif))
	for seqname, start in pos_start_dict.items()
	}
	neg_end = {
	seqname: get_end_location(start, len(motif))
	for seqname, start in neg_start_dict.items()
	}
	all_df = dict_to_dataframe(all_seqnames, pos_start_dict, neg_start_dict, pos_end, neg_end)
	all_df = all_df.astype({"start": "int64", "end": "int64"})
	target_dir = Path(bedfile).parent
	target_dir.mkdir(parents=True, exist_ok=True)
	all_df.to_csv(bedfile, sep="\t", index=False, header=False)


	@click.command()
	@click.option("--fasta", help="path to fasta file")
	@click.option("--motif", help="sequence to search for")
	@click.option("--bedfile", help="path to bedfile")
	def cli(fasta, motif, bedfile):
	main(fasta, motif, bedfile)


	if __name__ == "__main__":
	cli()