amanuel2/String Algorithms.py

## String Algorithms.py
################################ Robin Karp Rolling Hash (string searching)

class Solution:
    #  Rabin-Karp algorithm
    # In the hashing algo, we can have hash collision and when we have hash collision, 2
    # different words can have the same hash. So we need to check that 2 words with same hash
    # are really the same to avoid false positive.
    # This check frequency can be reduced by taking multiple hashes. In case we take 2 hashes
    # the probability of false negatives can go down to 10^-9. So we need to check only once.
    # Still the time complexity is O(N + M)
    # Can be made more optimum for worst case by using KMP.
    def strStr(self, haystack: str, needle: str) -> int:

        def charToIndex(char): return ord(char) - ord('a')

        def hashString(string):

            base = 26
            modulo = 10**9 + 7
            hash = 0

            # In this case the first char gets the highest weight of base^(n-1)
            for char in string:
                hash = (hash * base + charToIndex(char)) % modulo

            return hash

        def updateHash(hash, removeLetter, addLetter, windowLen):
            base = 26
            modulo = 10**9 + 7

            maxWeight = pow(base, windowLen-1, modulo) # base ** (windowLen - 1)
            return (
                # Letter getting removed was first and had base ^ n - 1 weight
                base * (hash - charToIndex(removeLetter) * maxWeight)
                + charToIndex(addLetter) # New letter add will have 1 weight
                ) % modulo

        if len(needle) > len(haystack): return -1

        if needle == haystack: return 0

        # Find hash for the first m characters for the initial comparision when i = 0
        hashNeedle = hashString(needle)
        hash = hashString(haystack[:len(needle)])

        # O(n+m)
        for i in range(len(haystack) - len(needle) + 1):

            # If the hash is same as needle then confrim if the string in current window matches
            # needles to avoid false positives (diff strings having same hash due to collision).
            if hash == hashNeedle:
                if needle == haystack[i: i + len(needle)]: return i

            # Update hash:
            # When we reach the last window we can't update hash anymore
            if i != len(haystack) - len(needle):
                # Remove first letter from hash and add letter next to current window in hash
                hash = updateHash(hash, haystack[i], haystack[i + len(needle)], len(needle))

        return -1

class Solution:
    def longestDupSubstring(self, S: str) -> str:
        # https://www.youtube.com/watch?v=BfUejqd07yo&t=357s

        mod = 1_000_000_007

        def fn(k):
            """Return duplicated substring of length k."""
            p = pow(26, k, mod)
            hs = 0
            seen = defaultdict(set)
            for i, ch in enumerate(s):
                hs = (26*hs + ord(ch) - 97) % mod
                if i >= k: hs = (hs - (ord(s[i-k])-97)*p) % mod # rolling hash
                if i+1 >= k:
                    if hs in seen and s[i+1-k:i+1] in seen[hs]: return s[i+1-k:i+1] # resolve hash collision
                    seen[hs].add(s[i+1-k:i+1])
            return ""

        lo, hi = 0, len(s)-1
        while lo < hi:
            mid = lo + hi + 1 >> 1
            if fn(mid): lo = mid
            else: hi = mid - 1
        return fn(lo)


################################ KMP Algorithm (string searching)
class Solution:
    def strStr(self, haystack: str, needle: str) -> int:
        # KMP Algorithm: build a table to keep track of patterns so we don't have to restart
        def build_pattern(substring):
            pattern = [0 for _ in substring]
            # two pointers one at the first, next position
            j = 0

            for i in range(1, len(substring)):
                while j > 0 and substring[i] != substring[j]:
                    j = pattern[j - 1]

                if substring[i] == substring[j]:
                    j += 1

                pattern[i] = j

            return pattern


        def does_match(haystack, needle, pattern):
            i = j = 0
            while i < len(haystack):
                if haystack[i] == needle[j]:
                    i += 1
                    j += 1

                    if j == len(needle):
                        return i - j  # Found a match

                else:
                    if j > 0:
                        j = pattern[j - 1]
                    else:
                        i += 1
            return -1


        pattern = build_pattern(needle)
        # print(pattern)
        return does_match(haystack, needle, pattern)
	################################ Robin Karp Rolling Hash (string searching)

	class Solution:
	# Rabin-Karp algorithm
	# In the hashing algo, we can have hash collision and when we have hash collision, 2
	# different words can have the same hash. So we need to check that 2 words with same hash
	# are really the same to avoid false positive.
	# This check frequency can be reduced by taking multiple hashes. In case we take 2 hashes
	# the probability of false negatives can go down to 10^-9. So we need to check only once.
	# Still the time complexity is O(N + M)
	# Can be made more optimum for worst case by using KMP.
	def strStr(self, haystack: str, needle: str) -> int:

	def charToIndex(char): return ord(char) - ord('a')

	def hashString(string):

	base = 26
	modulo = 10**9 + 7
	hash = 0

	# In this case the first char gets the highest weight of base^(n-1)
	for char in string:
	hash = (hash * base + charToIndex(char)) % modulo

	return hash

	def updateHash(hash, removeLetter, addLetter, windowLen):
	base = 26
	modulo = 10**9 + 7

	maxWeight = pow(base, windowLen-1, modulo) # base ** (windowLen - 1)
	return (
	# Letter getting removed was first and had base ^ n - 1 weight
	base * (hash - charToIndex(removeLetter) * maxWeight)
	+ charToIndex(addLetter) # New letter add will have 1 weight
	) % modulo

	if len(needle) > len(haystack): return -1

	if needle == haystack: return 0

	# Find hash for the first m characters for the initial comparision when i = 0
	hashNeedle = hashString(needle)
	hash = hashString(haystack[:len(needle)])

	# O(n+m)
	for i in range(len(haystack) - len(needle) + 1):

	# If the hash is same as needle then confrim if the string in current window matches
	# needles to avoid false positives (diff strings having same hash due to collision).
	if hash == hashNeedle:
	if needle == haystack[i: i + len(needle)]: return i

	# Update hash:
	# When we reach the last window we can't update hash anymore
	if i != len(haystack) - len(needle):
	# Remove first letter from hash and add letter next to current window in hash
	hash = updateHash(hash, haystack[i], haystack[i + len(needle)], len(needle))

	return -1

	class Solution:
	def longestDupSubstring(self, S: str) -> str:
	# https://www.youtube.com/watch?v=BfUejqd07yo&t=357s

	mod = 1_000_000_007

	def fn(k):
	"""Return duplicated substring of length k."""
	p = pow(26, k, mod)
	hs = 0
	seen = defaultdict(set)
	for i, ch in enumerate(s):
	hs = (26*hs + ord(ch) - 97) % mod
	if i >= k: hs = (hs - (ord(s[i-k])-97)*p) % mod # rolling hash
	if i+1 >= k:
	if hs in seen and s[i+1-k:i+1] in seen[hs]: return s[i+1-k:i+1] # resolve hash collision
	seen[hs].add(s[i+1-k:i+1])
	return ""

	lo, hi = 0, len(s)-1
	while lo < hi:
	mid = lo + hi + 1 >> 1
	if fn(mid): lo = mid
	else: hi = mid - 1
	return fn(lo)


	################################ KMP Algorithm (string searching)
	class Solution:
	def strStr(self, haystack: str, needle: str) -> int:
	# KMP Algorithm: build a table to keep track of patterns so we don't have to restart
	def build_pattern(substring):
	pattern = [0 for _ in substring]
	# two pointers one at the first, next position
	j = 0

	for i in range(1, len(substring)):
	while j > 0 and substring[i] != substring[j]:
	j = pattern[j - 1]

	if substring[i] == substring[j]:
	j += 1

	pattern[i] = j

	return pattern


	def does_match(haystack, needle, pattern):
	i = j = 0
	while i < len(haystack):
	if haystack[i] == needle[j]:
	i += 1
	j += 1

	if j == len(needle):
	return i - j # Found a match

	else:
	if j > 0:
	j = pattern[j - 1]
	else:
	i += 1
	return -1



	pattern = build_pattern(needle)
	# print(pattern)
	return does_match(haystack, needle, pattern)