Skip to content

Instantly share code, notes, and snippets.

@samehkamaleldin
Created August 19, 2022 16:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samehkamaleldin/b22245d92cf2c8e77f12544c0a03ddfb to your computer and use it in GitHub Desktop.
Save samehkamaleldin/b22245d92cf2c8e77f12544c0a03ddfb to your computer and use it in GitHub Desktop.
GR Scan
from collections import Counter
from typing import Dict
def main():
# ----------------- Configuration ----------------- #
fasta_filepath = "./data/Hsym_primary_v1.0.aa.fasta"
window_size = 16
slide_size = 4
aa_min_occurrences = {'R': 5, 'G': 3}
# -------------------------------------------------- #
fasta_data = read_fasta_file(fasta_filepath)
aa_min_occurrences_txt = "_".join(f"{aa}{aa_min_occurrences[aa]}" for aa in aa_min_occurrences)
output_motifs_fd = open(f"./data/motifs_W{window_size}_S{slide_size}_{aa_min_occurrences_txt}.txt", "w")
protein_match_count = 0
for p in fasta_data:
motifs_list = find_motifs_with_pattern(fasta_data[p], window_size, slide_size, aa_min_occurrences)
if len(motifs_list) > 0:
protein_match_count += 1
nb_motifs = len(motifs_list)
for idx, (start, end, motif) in enumerate(motifs_list):
start_end_txt = f"{start:04d}:{end:04d}"
idx_txt = f"(P{protein_match_count}:M{idx+1}/{nb_motifs})"
output_motifs_fd.write(f" {idx_txt:15s} - {p:15s} - {start_end_txt:12s} - {motif}\n")
output_motifs_fd.close()
def read_fasta_file(filepath: str) -> dict:
"""
Loads a fasta file and returns a dictionary of sequences.
"""
fasta_sequences = {}
with open(filepath) as fasta_file:
for line in fasta_file:
if line.startswith(">"):
sequence_name = line.strip()[1:]
fasta_sequences[sequence_name] = ""
else:
fasta_sequences[sequence_name] += line.strip()
return fasta_sequences
def find_motifs_with_pattern(sequence: str, window_size: int, slide_size: int, aa_min_occurrences: Dict) -> list:
"""
Finds all motifs with a specific amino acid occurrences in a sequences.
"""
motifs_list = []
for idx in range(0, len(sequence) - window_size, slide_size):
start, end = idx, idx + window_size
motif = sequence[start:end]
aa_counts = Counter(motif)
focus_aa_counts = [
1 for aa in aa_min_occurrences
if aa in aa_min_occurrences and aa_counts[aa] >= aa_min_occurrences[aa]
]
if len(focus_aa_counts) == len(aa_min_occurrences):
motifs_list.append((start, end, motif))
return motifs_list
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment