mavc/boyermoore.py

## boyermoore.py

'''
Boyer-Moore exact string search, implemented from the description
given in:
http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/bmen.htm

'''

def _bad_character_shift(pat):
    ''' Calculates the bad character shift as the distance of the
    rightmost occurance of a character to the end (without counting
    the last character of the string).


    >>> b = _bad_character_shift('ampanman')
    >>> sorted(b.items())
    [('a', 1), ('m', 2), ('n', 3), ('p', 5)]

    '''
    shift = {}

    # for every character in the string, record its rightmost occurence
    # as distance from the last character.
    m = len(pat) - 1

    for i, ch in enumerate(pat[:-1]):
        shift[ch] = m - i

    return shift

def _good_suffix_shift(pat):
    ''' Source: http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/bmen.htm

    >>> c = _good_suffix_shift('anpanman')
    >>> c
    [6, 6, 6, 6, 6, 6, 3, 8, 1]

    '''

    # Starting from the right, compute the position of the widest border
    # of the suffix starting at i.
    #
    # e.g. 0 1 2 3 4 5 6 7
    #      a b b a b a b
    #
    # We get f[2] == 4, for the border "bab" for babab.
    # Position 7 would indicate the empty string.
    #
    # We will calculate the value of shifts in the first loop, by the
    # borders that cannot be extended to the left.
    #
    # Position 2 has border 'bab' beginning at Position 4. Since
    # this border cannot be extended ('bbab' != 'abab'), we store
    # the shift from Position 4 to Position 2 (s[4] = 4-2).

    m = len(pat)
    f = [0] * (m + 1) # borders
    s = [0] * (m + 1) # shifts

    f[m] = j = m + 1
    for i in range(m, 0, -1):
        # Starting from the last element, check if we can extend the border for this
        # character (which starts at j).
        while (j <= m and pat[i-1] != pat[j-1]):
            # If we can't, extend the border, then store the shift in s[j].
            # (We'll need to shift the pattern this number to match suffixes.)
            s[j] = s[j] or j - i
            j = f[j] # For the next iteration, check the earliest suffix minus one
                     # character for a possible suffix match.

        # Now, j-1 is the earliest known border for the previous character.
        j -= 1
        f[i-1] = j

    # "Case 2"
    # In the second loop, we consider that for each pattern, a suffix can occur at the
    # beginning of the string. We need to be able to store shifts for these cases as well.
    # Note: The case when a pattern is not a substring of the str to be processed is a special
    # case of "Case 2".

    # For each index for which s[i] is empty:
    # Upon a mismatch at i, we know that there is not shift that will align a candidate pattern
    # within the string. Therefore, we shift until the head of the string matches the matching
    # suffix (after the mismatch position).

    # For i=0...f[0], the shift value is given by f[0], since that is the position of the widest
    # border of the whole string. Therefore, shifting that value will match the head of the string
    # with the suffix.

    j = f[0]
    for i in range(m + 1):
        s[i] = s[i] or j
        # However, when i < j, we have to take the next shortest border (since we will not have
        # already matched the pattern pat[j:]
        if (i == j): j = f[i]

    # Now we're done! Now, s[i] represents how the good suffix shift for a match of pat[i:] that
    # differs at position i - 1. (and s[0] represents the good suffix shift for a complete match).
    return s


def index(s, pat, start=0):
    ''' Returns the index of the first occurence of pat within s

    >>> index('hello', 'goodbye')
    -1

    >>> index('hello', 'hello')
    0

    >>> index('abracadabra', 'abra')
    0

    >>> index('abracadabra', 'abra', start=1)
    7

    >>> index('abracadabra', 'abra', start=7)
    7

    >>> index('abracadabra', 'abra', start=8)
    -1


    '''
    if len(s) < len(pat) + start: return -1

    m = len(pat)
    n = len(s)
    bad = _bad_character_shift(pat)
    suf = _good_suffix_shift(pat)

    i = start
    while (i + m <= n):

        for j, ch in enumerate(reversed(s[i:i+m])):
            if (ch != pat[-1-j]):
                # Upon a mismatch, advance forward the max of the
                # good suffix and bad character shifts.
                bshift = (bad[ch] if ch in bad else m) - j # can be negative.
                sshift = suf[m-j]
                i += max(bshift, sshift)
                break
        else:
            return i

    # (Not found).
    return -1

def indexall(s, pat, start=0):
    ''' Returns the indices of all occurences of pat within s

    >>> indexall('hello', 'goodbye')
    []

    >>> indexall('hello', 'hello')
    [0]

    >>> indexall('hello', 'l')
    [2, 3]

    >>> indexall('abracadabra', 'abra')
    [0, 7]

    '''
    if len(s) < len(pat) + start: return []

    m = len(pat)
    n = len(s)
    bad = _bad_character_shift(pat)
    suf = _good_suffix_shift(pat)
    indices = []

    i = start
    while (i + m <= n):

        for j, ch in enumerate(reversed(s[i:i+m])):
            if (ch != pat[-1-j]):
                # Upon a mismatch, advance forward the max of the
                # good suffix and bad character shifts.
                bshift = bad[ch] if ch in bad else m
                sshift = suf[m-j]
                i += max(bshift, sshift)
                break
        else:
            indices.append(i)
            i += suf[0]

    return indices

if __name__ == '__main__':
    import doctest
    doctest.testmod()

	'''
	Boyer-Moore exact string search, implemented from the description
	given in:
	http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/bmen.htm

	'''

	def _bad_character_shift(pat):
	''' Calculates the bad character shift as the distance of the
	rightmost occurance of a character to the end (without counting
	the last character of the string).


	>>> b = _bad_character_shift('ampanman')
	>>> sorted(b.items())
	[('a', 1), ('m', 2), ('n', 3), ('p', 5)]

	'''
	shift = {}

	# for every character in the string, record its rightmost occurence
	# as distance from the last character.
	m = len(pat) - 1

	for i, ch in enumerate(pat[:-1]):
	shift[ch] = m - i

	return shift

	def _good_suffix_shift(pat):
	''' Source: http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/bmen.htm

	>>> c = _good_suffix_shift('anpanman')
	>>> c
	[6, 6, 6, 6, 6, 6, 3, 8, 1]

	'''

	# Starting from the right, compute the position of the widest border
	# of the suffix starting at i.
	#
	# e.g. 0 1 2 3 4 5 6 7
	# a b b a b a b
	#
	# We get f[2] == 4, for the border "bab" for babab.
	# Position 7 would indicate the empty string.
	#
	# We will calculate the value of shifts in the first loop, by the
	# borders that cannot be extended to the left.
	#
	# Position 2 has border 'bab' beginning at Position 4. Since
	# this border cannot be extended ('bbab' != 'abab'), we store
	# the shift from Position 4 to Position 2 (s[4] = 4-2).

	m = len(pat)
	f = [0] * (m + 1) # borders
	s = [0] * (m + 1) # shifts

	f[m] = j = m + 1
	for i in range(m, 0, -1):
	# Starting from the last element, check if we can extend the border for this
	# character (which starts at j).
	while (j <= m and pat[i-1] != pat[j-1]):
	# If we can't, extend the border, then store the shift in s[j].
	# (We'll need to shift the pattern this number to match suffixes.)
	s[j] = s[j] or j - i
	j = f[j] # For the next iteration, check the earliest suffix minus one
	# character for a possible suffix match.

	# Now, j-1 is the earliest known border for the previous character.
	j -= 1
	f[i-1] = j

	# "Case 2"
	# In the second loop, we consider that for each pattern, a suffix can occur at the
	# beginning of the string. We need to be able to store shifts for these cases as well.
	# Note: The case when a pattern is not a substring of the str to be processed is a special
	# case of "Case 2".

	# For each index for which s[i] is empty:
	# Upon a mismatch at i, we know that there is not shift that will align a candidate pattern
	# within the string. Therefore, we shift until the head of the string matches the matching
	# suffix (after the mismatch position).

	# For i=0...f[0], the shift value is given by f[0], since that is the position of the widest
	# border of the whole string. Therefore, shifting that value will match the head of the string
	# with the suffix.

	j = f[0]
	for i in range(m + 1):
	s[i] = s[i] or j
	# However, when i < j, we have to take the next shortest border (since we will not have
	# already matched the pattern pat[j:]
	if (i == j): j = f[i]

	# Now we're done! Now, s[i] represents how the good suffix shift for a match of pat[i:] that
	# differs at position i - 1. (and s[0] represents the good suffix shift for a complete match).
	return s


	def index(s, pat, start=0):
	''' Returns the index of the first occurence of pat within s

	>>> index('hello', 'goodbye')
	-1

	>>> index('hello', 'hello')
	0

	>>> index('abracadabra', 'abra')
	0

	>>> index('abracadabra', 'abra', start=1)
	7

	>>> index('abracadabra', 'abra', start=7)
	7

	>>> index('abracadabra', 'abra', start=8)
	-1


	'''
	if len(s) < len(pat) + start: return -1

	m = len(pat)
	n = len(s)
	bad = _bad_character_shift(pat)
	suf = _good_suffix_shift(pat)

	i = start
	while (i + m <= n):

	for j, ch in enumerate(reversed(s[i:i+m])):
	if (ch != pat[-1-j]):
	# Upon a mismatch, advance forward the max of the
	# good suffix and bad character shifts.
	bshift = (bad[ch] if ch in bad else m) - j # can be negative.
	sshift = suf[m-j]
	i += max(bshift, sshift)
	break
	else:
	return i

	# (Not found).
	return -1

	def indexall(s, pat, start=0):
	''' Returns the indices of all occurences of pat within s

	>>> indexall('hello', 'goodbye')
	[]

	>>> indexall('hello', 'hello')
	[0]

	>>> indexall('hello', 'l')
	[2, 3]

	>>> indexall('abracadabra', 'abra')
	[0, 7]

	'''
	if len(s) < len(pat) + start: return []

	m = len(pat)
	n = len(s)
	bad = _bad_character_shift(pat)
	suf = _good_suffix_shift(pat)
	indices = []

	i = start
	while (i + m <= n):

	for j, ch in enumerate(reversed(s[i:i+m])):
	if (ch != pat[-1-j]):
	# Upon a mismatch, advance forward the max of the
	# good suffix and bad character shifts.
	bshift = bad[ch] if ch in bad else m
	sshift = suf[m-j]
	i += max(bshift, sshift)
	break
	else:
	indices.append(i)
	i += suf[0]

	return indices

	if __name__ == '__main__':
	import doctest
	doctest.testmod()