Skip to content

Instantly share code, notes, and snippets.

@Kesin11
Created January 14, 2012 08:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Kesin11/1610749 to your computer and use it in GitHub Desktop.
Save Kesin11/1610749 to your computer and use it in GitHub Desktop.
Simple SuffixArray (Python)
#coding: utf-8
'''
単純なSuffixArrayの実装
部分文字列の検索
Ngram数え上げ
'''
def multikeysort(string, indices, cmpindex=0):
"""単純なマルチキークイックソート
文字列のインデックスを使用したソート"""
if not indices:
return []
pivot = indices[0]
left=[]
mid=[]
right=[]
#cmpindex番目の文字を比較
for index in (indices[i] for i in xrange(1, len(indices))):
if string[index+cmpindex] < string[pivot+cmpindex]:
left.append(index)
elif string[index+cmpindex] == string[pivot+cmpindex]:
mid.append(index)
else:
right.append(index)
mid.append(pivot)
#番兵"$"によるIndexError防止と計算量軽減
if not string[pivot+cmpindex] == '$' and not len(mid) < 2:
mid = multikeysort(string, mid, cmpindex+1)
left = multikeysort(string, left, cmpindex)
right = multikeysort(string, right, cmpindex)
return left + mid + right
def get_suffixarray(string):
"""文字列からSuffixArrayを構築"""
indices = xrange(len(string))
return multikeysort(string, indices)
def binary_search(string, suffixarray, query):
"""部分文字列を見つけたときのmidを返す"""
endindex = len(query)
stringlen = len(string)
min = 0
max = len(suffixarray)-1
while True:
mid = (min+max) / 2
suffix = suffixarray[mid]
#IndexError防止
if not suffix + endindex > stringlen:
cmpstring = string[suffix:suffix+endindex]
if max < min:
return -1
elif cmpstring < query:
min = mid + 1
elif cmpstring > query:
max = mid - 1
#elif string[suffix][:endindex] == query:
else:
return mid
def search_all(string, suffixarray, query):
"""部分文字列の位置を全て検索
見つかれば接尾辞要素番号"""
endindex = len(query)
mid = binary_search(string, suffixarray, query)
left = mid
right = mid + 1
while True:
suffix = suffixarray[left]
cmpstring = string[suffix:suffix+endindex]
if cmpstring == query:
yield suffix
left -= 1
else:
break
while True:
suffix = suffixarray[right]
cmpstring = string[suffix:suffix+endindex]
if cmpstring == query:
yield suffix
right += 1
else:
break
def count_ngram(string, suffixarray, n):
"""n-gramを数えるイテレータ"""
#SuffixArrayからngram以上の文字列を取得するジェネレータ。"$"を除くので-1
cmp_strings = (string[i:i+n] for i in suffixarray if not i+n > len(string)-1)
#初期化
count_string = cmp_strings.next()
count = 1
for cmp_string in cmp_strings:
if count_string == cmp_string:
count += 1
else:
yield (count_string, count)
count_string = cmp_string
count = 1
yield (count_string, count)
#SuffixArrayの構築
>>> string = 'abracadabra$'
>>> suffixarray = get_suffixarray(string)
>>> for suffix in [string[i:] for i in suffixarray]:
... print suffix
$
a$
abra$
abracadabra$
acadabra$
adabra$
bra$
bracadabra$
cadabra$
dabra$
ra$
racadabra$
>>> string = u"イカちゃんかわいいイカちゃん" + "$"
>>> suffixarray = get_suffixarray(string)
>>> for suffix in [string[i:] for i in suffixarray]:
... print suffix
...
$
いいイカちゃん$
いイカちゃん$
かわいいイカちゃん$
ちゃん$
ちゃんかわいいイカちゃん$
ゃん$
ゃんかわいいイカちゃん$
わいいイカちゃん$
ん$
んかわいいイカちゃん$
イカちゃん$
イカちゃんかわいいイカちゃん$
カちゃん$
カちゃんかわいいイカちゃん$
#部分文字列の検索
>>> query = u'イカ'
>>> print [i for i in search_all(string, suffixarray, query)]
[9, 0]
#Ngram数え上げ
>>> for word, n in count_ngram(string, suffixarray, 2):
... print word, n
...
いい 1
いイ 1
かわ 1
ちゃ 2
ゃん 2
わい 1
んか 1
イカ 2
カち 2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment