Skip to content

Instantly share code, notes, and snippets.

@mavc
Created November 1, 2011 03:32
Show Gist options
  • Save mavc/1329810 to your computer and use it in GitHub Desktop.
Save mavc/1329810 to your computer and use it in GitHub Desktop.
Boyer-Moore string search
'''
Boyer-Moore exact string search, implemented from the description
given in:
http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/bmen.htm
'''
def _bad_character_shift(pat):
''' Calculates the bad character shift as the distance of the
rightmost occurance of a character to the end (without counting
the last character of the string).
>>> b = _bad_character_shift('ampanman')
>>> sorted(b.items())
[('a', 1), ('m', 2), ('n', 3), ('p', 5)]
'''
shift = {}
# for every character in the string, record its rightmost occurence
# as distance from the last character.
m = len(pat) - 1
for i, ch in enumerate(pat[:-1]):
shift[ch] = m - i
return shift
def _good_suffix_shift(pat):
''' Source: http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/bmen.htm
>>> c = _good_suffix_shift('anpanman')
>>> c
[6, 6, 6, 6, 6, 6, 3, 8, 1]
'''
# Starting from the right, compute the position of the widest border
# of the suffix starting at i.
#
# e.g. 0 1 2 3 4 5 6 7
# a b b a b a b
#
# We get f[2] == 4, for the border "bab" for babab.
# Position 7 would indicate the empty string.
#
# We will calculate the value of shifts in the first loop, by the
# borders that cannot be extended to the left.
#
# Position 2 has border 'bab' beginning at Position 4. Since
# this border cannot be extended ('bbab' != 'abab'), we store
# the shift from Position 4 to Position 2 (s[4] = 4-2).
m = len(pat)
f = [0] * (m + 1) # borders
s = [0] * (m + 1) # shifts
f[m] = j = m + 1
for i in range(m, 0, -1):
# Starting from the last element, check if we can extend the border for this
# character (which starts at j).
while (j <= m and pat[i-1] != pat[j-1]):
# If we can't, extend the border, then store the shift in s[j].
# (We'll need to shift the pattern this number to match suffixes.)
s[j] = s[j] or j - i
j = f[j] # For the next iteration, check the earliest suffix minus one
# character for a possible suffix match.
# Now, j-1 is the earliest known border for the previous character.
j -= 1
f[i-1] = j
# "Case 2"
# In the second loop, we consider that for each pattern, a suffix can occur at the
# beginning of the string. We need to be able to store shifts for these cases as well.
# Note: The case when a pattern is not a substring of the str to be processed is a special
# case of "Case 2".
# For each index for which s[i] is empty:
# Upon a mismatch at i, we know that there is not shift that will align a candidate pattern
# within the string. Therefore, we shift until the head of the string matches the matching
# suffix (after the mismatch position).
# For i=0...f[0], the shift value is given by f[0], since that is the position of the widest
# border of the whole string. Therefore, shifting that value will match the head of the string
# with the suffix.
j = f[0]
for i in range(m + 1):
s[i] = s[i] or j
# However, when i < j, we have to take the next shortest border (since we will not have
# already matched the pattern pat[j:]
if (i == j): j = f[i]
# Now we're done! Now, s[i] represents how the good suffix shift for a match of pat[i:] that
# differs at position i - 1. (and s[0] represents the good suffix shift for a complete match).
return s
def index(s, pat, start=0):
''' Returns the index of the first occurence of pat within s
>>> index('hello', 'goodbye')
-1
>>> index('hello', 'hello')
0
>>> index('abracadabra', 'abra')
0
>>> index('abracadabra', 'abra', start=1)
7
>>> index('abracadabra', 'abra', start=7)
7
>>> index('abracadabra', 'abra', start=8)
-1
'''
if len(s) < len(pat) + start: return -1
m = len(pat)
n = len(s)
bad = _bad_character_shift(pat)
suf = _good_suffix_shift(pat)
i = start
while (i + m <= n):
for j, ch in enumerate(reversed(s[i:i+m])):
if (ch != pat[-1-j]):
# Upon a mismatch, advance forward the max of the
# good suffix and bad character shifts.
bshift = (bad[ch] if ch in bad else m) - j # can be negative.
sshift = suf[m-j]
i += max(bshift, sshift)
break
else:
return i
# (Not found).
return -1
def indexall(s, pat, start=0):
''' Returns the indices of all occurences of pat within s
>>> indexall('hello', 'goodbye')
[]
>>> indexall('hello', 'hello')
[0]
>>> indexall('hello', 'l')
[2, 3]
>>> indexall('abracadabra', 'abra')
[0, 7]
'''
if len(s) < len(pat) + start: return []
m = len(pat)
n = len(s)
bad = _bad_character_shift(pat)
suf = _good_suffix_shift(pat)
indices = []
i = start
while (i + m <= n):
for j, ch in enumerate(reversed(s[i:i+m])):
if (ch != pat[-1-j]):
# Upon a mismatch, advance forward the max of the
# good suffix and bad character shifts.
bshift = bad[ch] if ch in bad else m
sshift = suf[m-j]
i += max(bshift, sshift)
break
else:
indices.append(i)
i += suf[0]
return indices
if __name__ == '__main__':
import doctest
doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment