Last active
August 29, 2015 14:12
-
-
Save smrmkt/fac2201de9b515bc6b37 to your computer and use it in GitHub Desktop.
Burrows Wheeler Transform
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#-*-coding:utf-8-*- | |
import argparse | |
# args | |
parser = argparse.ArgumentParser() | |
parser.add_argument('text') | |
def bwt(t): | |
# ソート済みの接尾辞配列を作成 | |
# 文字列の終端を表す文字として$を追加しておく | |
t += '$' | |
sa = sorted([t[i:] for i in range(len(t))]) | |
# BW変換を行った結果を返す | |
# 接尾辞が文字列全体と等しい場合は,文字列の最後の文字を使う | |
return ''.join([t[-len(s)-1] if len(s) < len(t) else t[-1] for s in sa]) | |
def inv_bwt(t): | |
# 終端文字の位置を取得 | |
p = t.index('$') | |
# 各文字の出現回数をカウント | |
c = {s: t.count(s) for s in set(list(t))} | |
# 辞書順に並べたときの各文字より前にある文字の合計個数を算出 | |
# LFマッピングの右辺第2項を出すのに必要 | |
total = 0 | |
for k in sorted(c): | |
cur = c[k] | |
c[k] = total | |
total += cur | |
# LFのマッピングを行う | |
lf_mapping = [t[:i+1].count(t[i])+c[t[i]]-1 for i in range(len(t))] | |
# 終端文字から逆順にたどる | |
ret = '' | |
for i in range(len(t)-1): | |
ret = t[lf_mapping[p]]+ret | |
p = lf_mapping[p] | |
return ret | |
if __name__ == '__main__': | |
text = parser.parse_args().text | |
l = bwt(text) | |
print l | |
print inv_bwt(l) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment