Created
August 15, 2015 13:04
-
-
Save kdm9/531e61f3949759986868 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# given a sequence find kmers of a given length that occur equal to t or more | |
# times in a given length of window in the sequence | |
from collections import deque | |
def kmer_frequency(text, k, window, thresh=3): | |
counts = dict() | |
kmers = deque() | |
for w in range(len(text)-window+1): | |
win = text[w:w+window] | |
freqs = set() | |
if w == 0: | |
for i in range(len(win)-k+1): | |
word = win[i:i+k] | |
kmers.append(word) | |
counts[word] = counts.get(word, 0) + 1 | |
if counts[word] >= thresh: | |
freqs.add(word) | |
else: | |
word = win[-k:] | |
kmers.append(word) | |
counts[word] = counts.get(word, 0) + 1 | |
word_to_rm = kmers.popleft() | |
counts[word_to_rm] -= 1 | |
if counts[word_to_rm] == 0: | |
counts.pop(word_to_rm, None) | |
if counts[word] >= thresh: | |
freqs.add(word) | |
yield freqs | |
with open("E-coli.txt", "r") as myfile: | |
text = myfile.read().replace('\n', '') | |
# text = raw_input('Paste sequence: ') | |
k = raw_input('Paste kmer size: ') | |
L = raw_input('Paste length of window: ') | |
t = raw_input('Paste the min num times a kmer should be seen: ') | |
lenk = int(k) | |
window = int(L) | |
times = int(t) | |
uniq = set() | |
for freqs in kmer_frequency(text, lenk, window, times): | |
uniq.update(freqs) | |
print len(uniq) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment