Created
January 14, 2012 08:55
-
-
Save Kesin11/1610749 to your computer and use it in GitHub Desktop.
Simple SuffixArray (Python)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding: utf-8 | |
''' | |
単純なSuffixArrayの実装 | |
部分文字列の検索 | |
Ngram数え上げ | |
''' | |
def multikeysort(string, indices, cmpindex=0): | |
"""単純なマルチキークイックソート | |
文字列のインデックスを使用したソート""" | |
if not indices: | |
return [] | |
pivot = indices[0] | |
left=[] | |
mid=[] | |
right=[] | |
#cmpindex番目の文字を比較 | |
for index in (indices[i] for i in xrange(1, len(indices))): | |
if string[index+cmpindex] < string[pivot+cmpindex]: | |
left.append(index) | |
elif string[index+cmpindex] == string[pivot+cmpindex]: | |
mid.append(index) | |
else: | |
right.append(index) | |
mid.append(pivot) | |
#番兵"$"によるIndexError防止と計算量軽減 | |
if not string[pivot+cmpindex] == '$' and not len(mid) < 2: | |
mid = multikeysort(string, mid, cmpindex+1) | |
left = multikeysort(string, left, cmpindex) | |
right = multikeysort(string, right, cmpindex) | |
return left + mid + right | |
def get_suffixarray(string): | |
"""文字列からSuffixArrayを構築""" | |
indices = xrange(len(string)) | |
return multikeysort(string, indices) | |
def binary_search(string, suffixarray, query): | |
"""部分文字列を見つけたときのmidを返す""" | |
endindex = len(query) | |
stringlen = len(string) | |
min = 0 | |
max = len(suffixarray)-1 | |
while True: | |
mid = (min+max) / 2 | |
suffix = suffixarray[mid] | |
#IndexError防止 | |
if not suffix + endindex > stringlen: | |
cmpstring = string[suffix:suffix+endindex] | |
if max < min: | |
return -1 | |
elif cmpstring < query: | |
min = mid + 1 | |
elif cmpstring > query: | |
max = mid - 1 | |
#elif string[suffix][:endindex] == query: | |
else: | |
return mid | |
def search_all(string, suffixarray, query): | |
"""部分文字列の位置を全て検索 | |
見つかれば接尾辞要素番号""" | |
endindex = len(query) | |
mid = binary_search(string, suffixarray, query) | |
left = mid | |
right = mid + 1 | |
while True: | |
suffix = suffixarray[left] | |
cmpstring = string[suffix:suffix+endindex] | |
if cmpstring == query: | |
yield suffix | |
left -= 1 | |
else: | |
break | |
while True: | |
suffix = suffixarray[right] | |
cmpstring = string[suffix:suffix+endindex] | |
if cmpstring == query: | |
yield suffix | |
right += 1 | |
else: | |
break | |
def count_ngram(string, suffixarray, n): | |
"""n-gramを数えるイテレータ""" | |
#SuffixArrayからngram以上の文字列を取得するジェネレータ。"$"を除くので-1 | |
cmp_strings = (string[i:i+n] for i in suffixarray if not i+n > len(string)-1) | |
#初期化 | |
count_string = cmp_strings.next() | |
count = 1 | |
for cmp_string in cmp_strings: | |
if count_string == cmp_string: | |
count += 1 | |
else: | |
yield (count_string, count) | |
count_string = cmp_string | |
count = 1 | |
yield (count_string, count) | |
#SuffixArrayの構築 | |
>>> string = 'abracadabra$' | |
>>> suffixarray = get_suffixarray(string) | |
>>> for suffix in [string[i:] for i in suffixarray]: | |
... print suffix | |
$ | |
a$ | |
abra$ | |
abracadabra$ | |
acadabra$ | |
adabra$ | |
bra$ | |
bracadabra$ | |
cadabra$ | |
dabra$ | |
ra$ | |
racadabra$ | |
>>> string = u"イカちゃんかわいいイカちゃん" + "$" | |
>>> suffixarray = get_suffixarray(string) | |
>>> for suffix in [string[i:] for i in suffixarray]: | |
... print suffix | |
... | |
$ | |
いいイカちゃん$ | |
いイカちゃん$ | |
かわいいイカちゃん$ | |
ちゃん$ | |
ちゃんかわいいイカちゃん$ | |
ゃん$ | |
ゃんかわいいイカちゃん$ | |
わいいイカちゃん$ | |
ん$ | |
んかわいいイカちゃん$ | |
イカちゃん$ | |
イカちゃんかわいいイカちゃん$ | |
カちゃん$ | |
カちゃんかわいいイカちゃん$ | |
#部分文字列の検索 | |
>>> query = u'イカ' | |
>>> print [i for i in search_all(string, suffixarray, query)] | |
[9, 0] | |
#Ngram数え上げ | |
>>> for word, n in count_ngram(string, suffixarray, 2): | |
... print word, n | |
... | |
いい 1 | |
いイ 1 | |
かわ 1 | |
ちゃ 2 | |
ゃん 2 | |
わい 1 | |
んか 1 | |
イカ 2 | |
カち 2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment