Skip to content

Instantly share code, notes, and snippets.

@trolldbois
Created October 16, 2011 16:48
Show Gist options
  • Save trolldbois/1291126 to your computer and use it in GitHub Desktop.
Save trolldbois/1291126 to your computer and use it in GitHub Desktop.
findPattern in sequence
def findPattern(sequence, elSize=1, minNbGroup=2):
'''
returns a regexp grouping repetitive patterns.
@param sequence: a sequence (str/bstr) with rfind() method.
@param elsize: the size of each element ( 1 to xxx ) in the sequence.
@param minNbGroup: the minimum number of repetition before trying to group the pattern.
Examples:
>>> s = 'aaaaa1111bbbccda2a2a2a2a2b1cb1cb1cb1cabcdabcdabcdabcdpooiiiuuuuyyyyy'
>>> pattern.findPattern(s,1)
' (a){5} (1){4} (b){3} (c){2} d (a2){5} (b1c){4} (abcd){4} p (o){2} (i){3} (u){4} (y){5} '
>>> s = 'aaaaa1111bbbccda2a2a2a2a2b1cb1cb1cb1cabcdabcdabcdabcdpooiiiuuuuyyyyy'
>>> pattern.findPattern(s,1,5)
' (a){5} 1111bbbccd (a2){5} b1cb1cb1cb1cabcdabcdabcdabcdpooiiiuuuu (y){5} '
'''
if (len(sequence) % elSize ) != 0:
raise ValueError('your sequence length:%d has to be a multiple of element size:%d'%(len(sequence),elSize))
patterns=[]
for seqlen in range(elSize, 1+(len(sequence)/2)):
seqs = [ sequence[i:i+seqlen] for i in xrange(0, len(sequence)-seqlen+1, elSize) ] # i %elSize, aligned on the elSize
for value,nb in collections.Counter(seqs).most_common():
while nb >= minNbGroup: # try repetition as long as it is > to minNbGroup
ind = sequence.rfind( value*nb ) # find the fulltext pattern
while ind != -1: # not found
patterns.append((nb*len(value), ind ,nb, value)) # biggest is best, ind++ is better, large nb best
ind = sequence.rfind(value*nb, 0, ind) # find it at another offset
nb-=1 # try with a smaller number of repetition
#
if len(patterns) == 0:
return sequence
patterns=list(set(patterns))
patterns.sort() # the fitness attribute is (length of pattern, indice, nb of repetition, pattern repeted)
best = patterns[-1] # higher wins
#print 'BEST:', best, best[0], best[3][:elSize], best[3][elSize:]
#print 'found new patterns :'
#for p in patterns:
# sequence2 = sequence.replace( p[3]*p[2], ' (%s){%d} '%(p[3],p[2]) )
# print p, sequence2
i = sequence.find(best[3]*best[2])
left = sequence[:i]
right = sequence[i+best[0]:]
ret = findPattern( left , elSize, minNbGroup)
ret2 = findPattern( right , elSize, minNbGroup)
return '%s (%s){%d} %s'%(ret,best[3],best[2],ret2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment