Skip to content

Instantly share code, notes, and snippets.

@calizarr
Last active December 27, 2015 21:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save calizarr/7393194 to your computer and use it in GitHub Desktop.
Save calizarr/7393194 to your computer and use it in GitHub Desktop.
Find k-mers in (L,t) clumps.
def allBelow(genome, k, L, t):
    """Return every k-mer that forms an (L, t)-clump in ``genome``.

    A k-mer forms an (L, t)-clump when at least ``t`` of its occurrences
    (overlapping ones included) lie completely inside one stretch of ``L``
    consecutive characters of ``genome``.

    Each time a clump-forming k-mer is discovered a progress line with the
    running total is printed to standard output.

    Args:
        genome: The string to scan.
        k: Length of the k-mers to look for; must be at least 1.
        L: Length of the window that has to hold the occurrences.
        t: Minimum number of occurrences inside one window; at least 1.

    Returns:
        A set containing each distinct k-mer that forms a clump.

    Raises:
        ValueError: If ``k`` or ``t`` is smaller than 1.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import collections`` being present.
    from collections import deque

    if k < 1 or t < 1:
        raise ValueError('k and t must both be at least 1')

    results = set()
    examined = set()  # every k-mer already scanned, whether it clumped or not
    # "+ 1" so that the k-mer ending on the last character is included.
    for x in range(len(genome) - k + 1):
        kMer = genome[x:x + k]
        if kMer in examined:
            continue  # a second scan would give the same answer
        examined.add(kMer)

        # Start positions of the t most recent occurrences, oldest first;
        # maxlen makes the deque drop the oldest one automatically.
        recent = deque(maxlen=t)
        # kMer has not been seen before, so its first occurrence is at x.
        start = genome.find(kMer, x)
        while start != -1:
            recent.append(start)
            # The newest occurrence must END inside the window that begins
            # at the oldest one, hence the "+ k".
            if len(recent) == t and recent[-1] + k <= recent[0] + L:
                results.add(kMer)
                print('Found a kmer! kMers at: ' + str(len(results)))
                break
            # Advance by one character so overlapping occurrences count.
            start = genome.find(kMer, start + 1)
    return results
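# Initial algorithm (two steps): findKmer() gathers the positions of frequent
# k-mers, and findClump() then filters such a mapping for (L, t)-clumps.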
def findKmer(genome, k, t):
    """Map each k-mer occurring at least ``t`` times to its start positions.

    Overlapping occurrences are counted, and every position list is in
    ascending order.

    Args:
        genome: The string to scan.
        k: Length of the k-mers; must be at least 1.
        t: Minimum number of occurrences a k-mer needs to be reported.

    Returns:
        A dict whose keys are the qualifying k-mers and whose values are
        lists of the indices in ``genome`` where that k-mer starts.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    # One left-to-right pass records every start position of every k-mer,
    # instead of re-searching the whole genome once per position.
    positions = {}
    # "+ 1" so that the k-mer ending on the last character is included.
    for x in range(len(genome) - k + 1):
        positions.setdefault(genome[x:x + k], []).append(x)

    return {kMer: indices
            for kMer, indices in positions.items()
            if len(indices) >= t}
def patternMatch(pattern, genome):
    """Return the start index of every occurrence of ``pattern`` in ``genome``.

    Occurrences may overlap. Because the search relies on ``str.find``, an
    empty ``pattern`` matches at every index from 0 to ``len(genome)``
    inclusive.

    Args:
        pattern: The substring to look for.
        genome: The string to search in.

    Returns:
        A list of indices in ascending order; empty if there is no match.
    """
    indices = []
    start = genome.find(pattern)
    while start != -1:
        indices.append(start)
        # Resume just one character later (not len(pattern) later) so that
        # overlapping occurrences are reported as well.
        start = genome.find(pattern, start + 1)
    return indices
def findClump(kMers, L, t):
    """Return the k-mers from ``kMers`` that form an (L, t)-clump.

    A k-mer qualifies when ``t`` consecutive entries of its position list
    fit, together with the k-mer's own length, inside a window of ``L``
    characters.

    Args:
        kMers: Mapping from k-mer string to a list of its start positions.
            The positions are assumed to be in ascending order (the order
            findKmer produces); this is not checked.
        L: Length of the window that has to hold the occurrences.
        t: Minimum number of occurrences inside one window; at least 1.

    Returns:
        A set of the k-mers that form a clump.

    Raises:
        ValueError: If ``t`` is smaller than 1.
    """
    if t < 1:
        raise ValueError('t must be at least 1')

    results = set()
    for kmer, indices in kMers.items():
        # The last occurrence has to END inside the window, so the first
        # and last start positions may be at most L - len(kmer) apart.
        max_gap = L - len(kmer)
        # Pair each position with the one t - 1 places later: together
        # they delimit a run of exactly t consecutive occurrences.
        runs = zip(indices, indices[t - 1:])
        if any(last - first <= max_gap for first, last in runs):
            results.add(kmer)
    return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment