Created
April 3, 2024 14:34
-
-
Save victorsanchezarevalo/b5bcb26cc2e96ff17ea5933a96d1bad0 to your computer and use it in GitHub Desktop.
Challenge-2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from pathlib import Path | |
def find_motifs_from_file(file_path, kmer_length=6, top_n=5):
    """
    Find the most frequent k-mers in a text file of DNA sequences.

    The file is read as text with one sequence per line. Every overlapping
    window of ``kmer_length`` characters in each line is counted, and the
    most frequent windows are returned.

    Parameters:
    - file_path (str or Path): The path to the text file containing the DNA sequences.
    - kmer_length (int): The length of the k-mers to consider in the motif search.
    - top_n (int): The maximum number of k-mers to return (default 5).

    Returns:
    - list: A list of (k-mer, frequency) tuples sorted by decreasing frequency.
      K-mers with equal frequency keep the order in which they were first seen.

    Raises:
    - ValueError: If kmer_length is smaller than 1 or top_n is negative.
    - FileNotFoundError: If file_path does not exist.
    """
    # A window of zero or negative width would silently count empty or
    # nonsensical slices, so reject it up front.
    if kmer_length < 1:
        raise ValueError(f"kmer_length must be at least 1, got {kmer_length}.")
    if top_n < 0:
        raise ValueError(f"top_n must not be negative, got {top_n}.")

    path = Path(file_path)
    # Check if the file exists
    if not path.exists():
        raise FileNotFoundError(f"The file '{file_path}' was not found.")

    # Count the frequency of each k-mer
    kmer_counts = defaultdict(int)
    for line in path.read_text().splitlines():
        # Drop surrounding whitespace so stray spaces/tabs never end up
        # inside a counted k-mer.
        seq = line.strip()
        # Sequences shorter than kmer_length give an empty range and
        # contribute nothing.
        for start in range(len(seq) - kmer_length + 1):
            kmer_counts[seq[start:start + kmer_length]] += 1

    # sorted() is stable, so ties stay in first-seen order.
    sorted_kmers = sorted(
        kmer_counts.items(), key=lambda item: item[1], reverse=True
    )
    return sorted_kmers[:top_n]
# Report the most frequent 6-mers found in ./reads.txt, one "kmer count"
# pair per line, most frequent first.
path = Path("./reads.txt")
consensus_candidates = find_motifs_from_file(file_path=path, kmer_length=6)
for kmer, count in consensus_candidates:
    print(f"{kmer} {count}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment