Last active
September 7, 2016 13:43
-
-
Save MitI-7/baaa195c4cde4156db5ce27c4ac733f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <vector> | |
#include <string> | |
#include <set> | |
#include <algorithm> | |
#include <map> | |
#include <vector> | |
#include <string> | |
#include <set> | |
#include <algorithm> | |
#include <assert.h> | |
using namespace std; | |
class SuffixArray { | |
public: | |
// 接尾辞配列を作成 | |
vector<int> build_sais(vector<int> &sa, const string &s) { | |
assert(sa.size() >= s.size() + 1); | |
assert(s.size() > 1); | |
for (int i = 0; i < sa.size(); ++i) { | |
sa[i] = -1; | |
} | |
// 1. LMS-substringsを求める. | |
const vector<bool> is_type_s = this->make_type_list(s); | |
vector<int> bucket = this->make_bucket_back(s); | |
// LMSの開始indexを各バケツの後ろから詰め込む | |
// LMSはS-type | |
sa[0] = s.size(); // 終端文字 | |
for (int i = 0; i < s.size(); ++i) { | |
if (is_lms_char(i, is_type_s)) { | |
sa[--bucket[s[i]]] = i; | |
} | |
} | |
this->induced_sort(s, sa, is_type_s); | |
// LMS-substringの元の数値列における位置を辞書順に左詰めで格納する | |
int num_lms = 0; | |
for (int i = 0; i < sa.size(); ++i) { | |
if (is_lms_char(sa[i], is_type_s)) { | |
sa[num_lms++] = sa[i]; | |
} | |
} | |
// 辞書順以外を一旦クリア | |
for (int i = num_lms; i < sa.size(); ++i) { | |
sa[i] = -1; | |
} | |
// LMS-substringのidを元の数値列における出現順で空いているところに格納する | |
sa[num_lms + sa[0] / 2] = 0; | |
for (int i = 1; i < num_lms; ++i) { | |
int p = sa[i - 1]; | |
int q = sa[i]; | |
bool same = i != 1; // iが1のときは番兵なので必ず異なる | |
for (int w = 0; same; w++) { | |
if (s[p + w] != s[q + w]) { | |
same = false; | |
} | |
bool p_is_lmc = is_lms_char(p + w, is_type_s); | |
bool q_is_lmc = is_lms_char(q + w, is_type_s); | |
// pもqもLなら続ける | |
if (!p_is_lmc && !q_is_lmc) { | |
continue; | |
} | |
// pもqもSならおわり | |
if (p_is_lmc && q_is_lmc && w != 0) { | |
break; | |
} | |
} | |
if (same) { | |
sa[num_lms + q / 2] = sa[num_lms + p / 2]; | |
} | |
else { | |
sa[num_lms + q / 2] = sa[num_lms + p / 2] + 1; | |
} | |
} | |
// LMS-substringのidを元の数値列における出現順で右詰で格納する | |
int next_alphabet_size = sa[num_lms + sa[num_lms - 1] / 2] + 1; | |
for (int i = sa.size() - 1, p = sa.size() - 1; i >= num_lms; i--) { | |
if (sa[i] != -1) { | |
sa[p--] = sa[i]; | |
} | |
} | |
// 同じLMS-substringがある場合は再帰的に順序を求めるa | |
if (next_alphabet_size != num_lms) { | |
string s1 = ""; | |
for (int i = sa.size() - num_lms; i < sa.size(); ++i) { | |
s1 += to_string(sa[i]); | |
} | |
build_sais(sa, s1); | |
// 右づめにする | |
for (int i = 0; i < num_lms; i++) { | |
sa[sa.size() - num_lms + sa[i + 1]] = i; | |
} | |
int p = 0; | |
for (int i = 0; i < sa.size(); ++i) { | |
if (is_lms_char(i, is_type_s)) { | |
int ax = sa[sa.size() - num_lms + p]; | |
sa[ax] = i; | |
p++; | |
} | |
} | |
} | |
// 辞書順以外を一旦クリア | |
for (int i = num_lms; i < sa.size(); ++i) { | |
sa[i] = -1; | |
} | |
// 配列の末尾から正規の位置に詰めなおす | |
bucket = make_bucket_back(s); | |
for (int i = num_lms - 1; i >= 0; --i) { | |
int c = s[sa[i]]; | |
bucket[c]--; | |
swap(sa[bucket[c]], sa[i]); | |
} | |
this->induced_sort(s, sa, is_type_s); | |
return sa; | |
} | |
// 各文字について,L-typeかS-typeか求める | |
vector<bool> make_type_list(const string &s) { | |
vector<bool> is_type_s(s.size() + 1, true); | |
for (int i = s.size() - 1; i >= 0; --i) { | |
if (s[i] == s[i + 1]) { | |
is_type_s[i] = is_type_s[i + 1]; | |
} | |
else { | |
is_type_s[i] = s[i] < s[i + 1]; | |
} | |
} | |
return is_type_s; | |
} | |
vector<int> make_bucket_back(const string &s) { | |
vector<int> bucket(256); | |
bucket[0] = 1; | |
for (char c : s) { | |
bucket[c]++; | |
} | |
for (int i = 0; i < bucket.size() - 1; ++i) { | |
bucket[i + 1] += bucket[i]; | |
} | |
return bucket; | |
} | |
vector<int> make_bucket_front(const string &s) { | |
vector<int> bucket(256); | |
bucket[0] = 1; | |
for (char c : s) { | |
bucket[c]++; | |
} | |
vector<int> rtn(256); | |
for (int i = 0; i < bucket.size() - 1; ++i) { | |
rtn[i + 1] = rtn[i] + bucket[i]; | |
} | |
return rtn; | |
} | |
bool is_lms_char(int i, const vector<bool> &is_s) { | |
// 終端文字はS | |
if (i == is_s.size()) { | |
return true; | |
} | |
if (0 < i && i < is_s.size()) { | |
return is_s[i] && !is_s[i - 1]; | |
} | |
return false; | |
} | |
void induced_sort(string s, vector<int> &sa, const vector<bool> &is_type_s) { | |
vector<int> bucket; | |
// L-type sort | |
bucket = make_bucket_front(s); | |
for (int i = 0; i < sa.size(); ++i) { | |
int pre_index = sa[i] - 1; | |
// iのひとつ前の文字がL-typeなら | |
if (sa[i] != -1 && sa[i] > 0 && !is_type_s[pre_index]) { | |
char pre_char = s[pre_index]; | |
int insert_index = bucket[pre_char]; | |
sa[insert_index] = pre_index; | |
bucket[pre_char]++; | |
} | |
} | |
// S-type sort | |
bucket = make_bucket_back(s); | |
for (int i = sa.size() - 1; i >= 0; --i) { | |
int pre_index = sa[i] - 1; | |
// iのひとつ前のも文字がS-typeなら | |
if (sa[i] != -1 && sa[i] > 0 && is_type_s[pre_index]) { | |
char pre_char = s[pre_index]; | |
sa[--bucket[pre_char]] = pre_index; | |
} | |
} | |
} | |
void show_sa(string s, vector<int> sa) { | |
cout << "sa" << endl; | |
for (int i = 0; i < sa.size(); ++i) { | |
cout << i << "|" << sa[i] << endl; | |
} | |
} | |
}; | |
void test() { | |
SuffixArray sa; | |
// SSLSLSLSSLLS | |
vector<bool> v{ true, true, false, true, false, true, false, true, true, false, false, true }; | |
assert(sa.make_type_list("abracadabra") == v); | |
v = { false, false, true, true, false, false, true, true, false, false, true, true, false, false, false, false, true }; | |
assert(sa.make_type_list("mmiissiissiippii") == v); | |
} | |
#include <random> | |
// 長さnのランダムな文字列を作成 | |
string make_random_string(int n) { | |
random_device rnd; | |
mt19937 mt(rnd()); | |
//uniform_int_distribution<> rand100(48, 57); // 0-9 | |
//uniform_int_distribution<> rand100(65, 90); // A-Z | |
uniform_int_distribution<> rand100(97, 122); // a-z | |
string s = ""; | |
for (int i = 0; i < n; ++i) { | |
s += rand100(mt); | |
} | |
return s; | |
} | |
vector<int> raku(string s) { | |
vector<string> v; | |
for (int i = 0; i <= s.size(); ++i) { | |
string t = s.substr(i); | |
v.push_back(t); | |
} | |
sort(v.begin(), v.end()); | |
vector<int> ans; | |
for (int i = 0; i < v.size(); ++i) { | |
ans.push_back(s.size() - v[i].size()); | |
} | |
return ans; | |
} | |
void test_frame(string s, vector<int> ans) { | |
SuffixArray sa; | |
vector<int> v(s.size() + 1, -1); | |
if (sa.build_sais(v, s) != v) { | |
cout << "ng" << endl; | |
cout << s << endl; | |
for (int i = 0; i < v.size(); ++i) { | |
cout << i << ":" << ans[i] << "->" << v[i] << endl; | |
} | |
} | |
else { | |
//cout << "ok" << endl; | |
} | |
} | |
void random_test() { | |
SuffixArray sa; | |
for (int l = 2; l < 100; ++l) { | |
cout << l << endl; | |
for (int i = 0; i < 10000; ++i) { | |
string s = make_random_string(l); | |
cout << s << endl; | |
test_frame(s, raku(s)); | |
} | |
} | |
} | |
void main() { | |
/* | |
test_frame("peqejgt", {7, 3, 1, 5, 4, 0, 2, 6}); | |
test_frame("vdhehev", { 7, 1, 3, 5, 2, 4, 6, 0 }); | |
test_frame("abracadabra", { 11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2 }); | |
test_frame("mississippi", { 11, 10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2 }); | |
test_frame("mmiissiissiippii", { 16, 15, 14, 10, 6, 2, 11, 7, 3, 1, 0, 13, 12, 9, 5, 8, 4 }); | |
test_frame("bddwququqx", { 10, 0, 1, 2, 4, 6, 8, 5, 7, 3, 9 }); | |
*/ | |
test_frame("kdvdvdztw", { 9, 1, 3, 5, 0, 7, 2, 4, 8, 6 }); | |
//random_test(); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment