Created
August 19, 2022 16:37
-
-
Save samehkamaleldin/b22245d92cf2c8e77f12544c0a03ddfb to your computer and use it in GitHub Desktop.
GR Scan
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
from typing import Dict | |
def main(): | |
# ----------------- Configuration ----------------- # | |
fasta_filepath = "./data/Hsym_primary_v1.0.aa.fasta" | |
window_size = 16 | |
slide_size = 4 | |
aa_min_occurrences = {'R': 5, 'G': 3} | |
# -------------------------------------------------- # | |
fasta_data = read_fasta_file(fasta_filepath) | |
aa_min_occurrences_txt = "_".join(f"{aa}{aa_min_occurrences[aa]}" for aa in aa_min_occurrences) | |
output_motifs_fd = open(f"./data/motifs_W{window_size}_S{slide_size}_{aa_min_occurrences_txt}.txt", "w") | |
protein_match_count = 0 | |
for p in fasta_data: | |
motifs_list = find_motifs_with_pattern(fasta_data[p], window_size, slide_size, aa_min_occurrences) | |
if len(motifs_list) > 0: | |
protein_match_count += 1 | |
nb_motifs = len(motifs_list) | |
for idx, (start, end, motif) in enumerate(motifs_list): | |
start_end_txt = f"{start:04d}:{end:04d}" | |
idx_txt = f"(P{protein_match_count}:M{idx+1}/{nb_motifs})" | |
output_motifs_fd.write(f" {idx_txt:15s} - {p:15s} - {start_end_txt:12s} - {motif}\n") | |
output_motifs_fd.close() | |
def read_fasta_file(filepath: str) -> dict: | |
""" | |
Loads a fasta file and returns a dictionary of sequences. | |
""" | |
fasta_sequences = {} | |
with open(filepath) as fasta_file: | |
for line in fasta_file: | |
if line.startswith(">"): | |
sequence_name = line.strip()[1:] | |
fasta_sequences[sequence_name] = "" | |
else: | |
fasta_sequences[sequence_name] += line.strip() | |
return fasta_sequences | |
def find_motifs_with_pattern(sequence: str, window_size: int, slide_size: int, aa_min_occurrences: Dict) -> list: | |
""" | |
Finds all motifs with a specific amino acid occurrences in a sequences. | |
""" | |
motifs_list = [] | |
for idx in range(0, len(sequence) - window_size, slide_size): | |
start, end = idx, idx + window_size | |
motif = sequence[start:end] | |
aa_counts = Counter(motif) | |
focus_aa_counts = [ | |
1 for aa in aa_min_occurrences | |
if aa in aa_min_occurrences and aa_counts[aa] >= aa_min_occurrences[aa] | |
] | |
if len(focus_aa_counts) == len(aa_min_occurrences): | |
motifs_list.append((start, end, motif)) | |
return motifs_list | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment