Skip to content

Instantly share code, notes, and snippets.

@kdm9
Created August 15, 2015 13:04
Show Gist options
  • Save kdm9/531e61f3949759986868 to your computer and use it in GitHub Desktop.
Save kdm9/531e61f3949759986868 to your computer and use it in GitHub Desktop.
# given a sequence find kmers of a given length that occur equal to t or more
# times in a given length of window in the sequence
from collections import deque
def kmer_frequency(text, k, window, thresh=3):
counts = dict()
kmers = deque()
for w in range(len(text)-window+1):
win = text[w:w+window]
freqs = set()
if w == 0:
for i in range(len(win)-k+1):
word = win[i:i+k]
kmers.append(word)
counts[word] = counts.get(word, 0) + 1
if counts[word] >= thresh:
freqs.add(word)
else:
word = win[-k:]
kmers.append(word)
counts[word] = counts.get(word, 0) + 1
word_to_rm = kmers.popleft()
counts[word_to_rm] -= 1
if counts[word_to_rm] == 0:
counts.pop(word_to_rm, None)
if counts[word] >= thresh:
freqs.add(word)
yield freqs
with open("E-coli.txt", "r") as myfile:
text = myfile.read().replace('\n', '')
# text = raw_input('Paste sequence: ')
k = raw_input('Paste kmer size: ')
L = raw_input('Paste length of window: ')
t = raw_input('Paste the min num times a kmer should be seen: ')
lenk = int(k)
window = int(L)
times = int(t)
uniq = set()
for freqs in kmer_frequency(text, lenk, window, times):
uniq.update(freqs)
print len(uniq)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment