Skip to content

Instantly share code, notes, and snippets.

@victorsanchezarevalo
Created April 3, 2024 14:34
Show Gist options
  • Save victorsanchezarevalo/b5bcb26cc2e96ff17ea5933a96d1bad0 to your computer and use it in GitHub Desktop.
Save victorsanchezarevalo/b5bcb26cc2e96ff17ea5933a96d1bad0 to your computer and use it in GitHub Desktop.
Challenge-2
from collections import defaultdict
from pathlib import Path
def find_motifs_from_file(file_path, kmer_length=6):
"""
Reads a text file with DNA sequences, one per line, using pathlib and finds the most frequent k-mers.
Parameters:
- file_path (str): The path to the text file containing the DNA sequences.
- kmer_length (int): The length of the k-mers to consider in the motif search.
Returns:
- list: A list of tuples, where each tuple contains a k-mer and its frequency.
"""
path = Path(file_path)
# Check if the file exists
if not path.exists():
raise FileNotFoundError(f"The file '{file_path}' was not found.")
# Read the sequences from the file using read_text()
sequences = path.read_text().splitlines()
# Count the frequency of each k-mer
kmer_counts = defaultdict(int)
for seq in sequences:
for i in range(len(seq) - kmer_length + 1):
kmer = seq[i:i+kmer_length]
kmer_counts[kmer] += 1
# Sort the k-mers by frequency and select the top 5
sorted_kmers = sorted(kmer_counts.items(), key=lambda x: x[1], reverse=True)
top_kmers = sorted_kmers[:5] # Adjust as necessary
return top_kmers
# Script section: rank the 6-mers found in ./reads.txt (resolved against the
# current working directory) and print one "<k-mer> <count>" line per result.
path = Path("./reads.txt")
consensus_candidates = find_motifs_from_file(path, kmer_length=6)
for kmer, count in consensus_candidates:
    print(f"{kmer} {count}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment