Last active
October 13, 2022 14:59
-
-
Save HaraldKorneliussen/2bf20ca4f4f28c1aaa917b146cf39d84 to your computer and use it in GitHub Desktop.
A naive python implementation of David A. Scott's bijective Burrows-Wheeler transform + inverse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# Factor a string into nonincreasing Lyndon words, | |
# using Duval's algorithm | |
def lyndonFactor(s): | |
k = 0 | |
ret = [] | |
while k < len(s): | |
i = k | |
j = i + 1 | |
while j < len(s) and s[i] <= s[j]: | |
if s[i] == s[j]: | |
i += 1 | |
else: | |
i = k | |
j += 1 | |
oldk = k | |
k = k + j - i | |
ret.append(s[oldk:k]) | |
return ret | |
# A special string comparison order must be used in the bijective BWTS: | |
# infinite lexicographical order. It's like regular lexorder, but on the | |
# infinite repeated concatenation of the words. | |
# To do this in python's sorting key scheme, extend all strings to | |
# sufficient length to compare them. | |
def infLexOrderKey(a,l): | |
while len(a) < l: | |
a = a * 2 | |
if len(a) > l: | |
a = a[0:l] | |
return a | |
# Get a list of all the rotations of the Lyndon factors of s | |
def lyndonConjugates(s): | |
factors = lyndonFactor(s) | |
ret = [] | |
for factor in factors: | |
for idx, _ in enumerate(factor): | |
ret.append(factor[-idx:] + factor[:-idx]) | |
return ret | |
# The "suffix standard permutation" induced by a word s | |
# as I understand it, that's the same as the inverse suffix array | |
def standardPermutation(s): | |
return [x[0] for x in sorted(enumerate(s), key=lambda p: p[1])] | |
# Decompose a permutation into a list of cycles. | |
def findCycles(p): | |
allCycles = [] | |
for i in range(len(p)): | |
currentIndex = i | |
currentCycle = [] | |
while p[currentIndex] != -1: | |
newIndex = p[currentIndex] # Start from the second element in cycle, to ease decoding | |
currentCycle.append(newIndex) | |
p[currentIndex] = -1 | |
currentIndex = newIndex | |
if len(currentCycle) > 0: | |
allCycles.append(currentCycle) | |
return allCycles[::-1] # Reverse list of cycles to ease decoding | |
# David A. Scott's Bijective Burrows-Wheeler transform: | |
# take all the rotations of the lyndon factors, | |
# sort them in inflex order, and take the last character of each. | |
def bwts(s): | |
conjs = lyndonConjugates(s) | |
maxlen = len(s) # lcm of all the lengths suffices, but this should be safe. | |
return ''.join( [f[-1] for f in sorted(lyndonConjugates(s), key=lambda a: infLexOrderKey(a, maxlen))]) | |
# The inverse: take the standard permutation induced by s, | |
# find all cycles in it, label them according to s, and append. | |
def ibwts(s): | |
cycles = findCycles(standardPermutation(s)) | |
return ''.join([''.join(map(lambda i: s[i], cycle)) for cycle in cycles]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment