Skip to content

Instantly share code, notes, and snippets.

@MitI-7
Last active September 7, 2016 13:43
Show Gist options
  • Save MitI-7/baaa195c4cde4156db5ce27c4ac733f3 to your computer and use it in GitHub Desktop.
Save MitI-7/baaa195c4cde4156db5ce27c4ac733f3 to your computer and use it in GitHub Desktop.
#include <iostream>
#include <vector>
#include <string>
#include <set>
#include <algorithm>
#include <map>
#include <vector>
#include <string>
#include <set>
#include <algorithm>
#include <assert.h>
using namespace std;
class SuffixArray {
public:
// 接尾辞配列を作成
vector<int> build_sais(vector<int> &sa, const string &s) {
assert(sa.size() >= s.size() + 1);
assert(s.size() > 1);
for (int i = 0; i < sa.size(); ++i) {
sa[i] = -1;
}
// 1. LMS-substringsを求める.
const vector<bool> is_type_s = this->make_type_list(s);
vector<int> bucket = this->make_bucket_back(s);
// LMSの開始indexを各バケツの後ろから詰め込む
// LMSはS-type
sa[0] = s.size(); // 終端文字
for (int i = 0; i < s.size(); ++i) {
if (is_lms_char(i, is_type_s)) {
sa[--bucket[s[i]]] = i;
}
}
this->induced_sort(s, sa, is_type_s);
// LMS-substringの元の数値列における位置を辞書順に左詰めで格納する
int num_lms = 0;
for (int i = 0; i < sa.size(); ++i) {
if (is_lms_char(sa[i], is_type_s)) {
sa[num_lms++] = sa[i];
}
}
// 辞書順以外を一旦クリア
for (int i = num_lms; i < sa.size(); ++i) {
sa[i] = -1;
}
// LMS-substringのidを元の数値列における出現順で空いているところに格納する
sa[num_lms + sa[0] / 2] = 0;
for (int i = 1; i < num_lms; ++i) {
int p = sa[i - 1];
int q = sa[i];
bool same = i != 1; // iが1のときは番兵なので必ず異なる
for (int w = 0; same; w++) {
if (s[p + w] != s[q + w]) {
same = false;
}
bool p_is_lmc = is_lms_char(p + w, is_type_s);
bool q_is_lmc = is_lms_char(q + w, is_type_s);
// pもqもLなら続ける
if (!p_is_lmc && !q_is_lmc) {
continue;
}
// pもqもSならおわり
if (p_is_lmc && q_is_lmc && w != 0) {
break;
}
}
if (same) {
sa[num_lms + q / 2] = sa[num_lms + p / 2];
}
else {
sa[num_lms + q / 2] = sa[num_lms + p / 2] + 1;
}
}
// LMS-substringのidを元の数値列における出現順で右詰で格納する
int next_alphabet_size = sa[num_lms + sa[num_lms - 1] / 2] + 1;
for (int i = sa.size() - 1, p = sa.size() - 1; i >= num_lms; i--) {
if (sa[i] != -1) {
sa[p--] = sa[i];
}
}
// 同じLMS-substringがある場合は再帰的に順序を求めるa
if (next_alphabet_size != num_lms) {
string s1 = "";
for (int i = sa.size() - num_lms; i < sa.size(); ++i) {
s1 += to_string(sa[i]);
}
build_sais(sa, s1);
// 右づめにする
for (int i = 0; i < num_lms; i++) {
sa[sa.size() - num_lms + sa[i + 1]] = i;
}
int p = 0;
for (int i = 0; i < sa.size(); ++i) {
if (is_lms_char(i, is_type_s)) {
int ax = sa[sa.size() - num_lms + p];
sa[ax] = i;
p++;
}
}
}
// 辞書順以外を一旦クリア
for (int i = num_lms; i < sa.size(); ++i) {
sa[i] = -1;
}
// 配列の末尾から正規の位置に詰めなおす
bucket = make_bucket_back(s);
for (int i = num_lms - 1; i >= 0; --i) {
int c = s[sa[i]];
bucket[c]--;
swap(sa[bucket[c]], sa[i]);
}
this->induced_sort(s, sa, is_type_s);
return sa;
}
// 各文字について,L-typeかS-typeか求める
vector<bool> make_type_list(const string &s) {
vector<bool> is_type_s(s.size() + 1, true);
for (int i = s.size() - 1; i >= 0; --i) {
if (s[i] == s[i + 1]) {
is_type_s[i] = is_type_s[i + 1];
}
else {
is_type_s[i] = s[i] < s[i + 1];
}
}
return is_type_s;
}
vector<int> make_bucket_back(const string &s) {
vector<int> bucket(256);
bucket[0] = 1;
for (char c : s) {
bucket[c]++;
}
for (int i = 0; i < bucket.size() - 1; ++i) {
bucket[i + 1] += bucket[i];
}
return bucket;
}
vector<int> make_bucket_front(const string &s) {
vector<int> bucket(256);
bucket[0] = 1;
for (char c : s) {
bucket[c]++;
}
vector<int> rtn(256);
for (int i = 0; i < bucket.size() - 1; ++i) {
rtn[i + 1] = rtn[i] + bucket[i];
}
return rtn;
}
bool is_lms_char(int i, const vector<bool> &is_s) {
// 終端文字はS
if (i == is_s.size()) {
return true;
}
if (0 < i && i < is_s.size()) {
return is_s[i] && !is_s[i - 1];
}
return false;
}
void induced_sort(string s, vector<int> &sa, const vector<bool> &is_type_s) {
vector<int> bucket;
// L-type sort
bucket = make_bucket_front(s);
for (int i = 0; i < sa.size(); ++i) {
int pre_index = sa[i] - 1;
// iのひとつ前の文字がL-typeなら
if (sa[i] != -1 && sa[i] > 0 && !is_type_s[pre_index]) {
char pre_char = s[pre_index];
int insert_index = bucket[pre_char];
sa[insert_index] = pre_index;
bucket[pre_char]++;
}
}
// S-type sort
bucket = make_bucket_back(s);
for (int i = sa.size() - 1; i >= 0; --i) {
int pre_index = sa[i] - 1;
// iのひとつ前のも文字がS-typeなら
if (sa[i] != -1 && sa[i] > 0 && is_type_s[pre_index]) {
char pre_char = s[pre_index];
sa[--bucket[pre_char]] = pre_index;
}
}
}
void show_sa(string s, vector<int> sa) {
cout << "sa" << endl;
for (int i = 0; i < sa.size(); ++i) {
cout << i << "|" << sa[i] << endl;
}
}
};
void test() {
SuffixArray sa;
// SSLSLSLSSLLS
vector<bool> v{ true, true, false, true, false, true, false, true, true, false, false, true };
assert(sa.make_type_list("abracadabra") == v);
v = { false, false, true, true, false, false, true, true, false, false, true, true, false, false, false, false, true };
assert(sa.make_type_list("mmiissiissiippii") == v);
}
#include <random>
// 長さnのランダムな文字列を作成
string make_random_string(int n) {
random_device rnd;
mt19937 mt(rnd());
//uniform_int_distribution<> rand100(48, 57); // 0-9
//uniform_int_distribution<> rand100(65, 90); // A-Z
uniform_int_distribution<> rand100(97, 122); // a-z
string s = "";
for (int i = 0; i < n; ++i) {
s += rand100(mt);
}
return s;
}
vector<int> raku(string s) {
vector<string> v;
for (int i = 0; i <= s.size(); ++i) {
string t = s.substr(i);
v.push_back(t);
}
sort(v.begin(), v.end());
vector<int> ans;
for (int i = 0; i < v.size(); ++i) {
ans.push_back(s.size() - v[i].size());
}
return ans;
}
void test_frame(string s, vector<int> ans) {
SuffixArray sa;
vector<int> v(s.size() + 1, -1);
if (sa.build_sais(v, s) != v) {
cout << "ng" << endl;
cout << s << endl;
for (int i = 0; i < v.size(); ++i) {
cout << i << ":" << ans[i] << "->" << v[i] << endl;
}
}
else {
//cout << "ok" << endl;
}
}
void random_test() {
SuffixArray sa;
for (int l = 2; l < 100; ++l) {
cout << l << endl;
for (int i = 0; i < 10000; ++i) {
string s = make_random_string(l);
cout << s << endl;
test_frame(s, raku(s));
}
}
}
void main() {
/*
test_frame("peqejgt", {7, 3, 1, 5, 4, 0, 2, 6});
test_frame("vdhehev", { 7, 1, 3, 5, 2, 4, 6, 0 });
test_frame("abracadabra", { 11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2 });
test_frame("mississippi", { 11, 10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2 });
test_frame("mmiissiissiippii", { 16, 15, 14, 10, 6, 2, 11, 7, 3, 1, 0, 13, 12, 9, 5, 8, 4 });
test_frame("bddwququqx", { 10, 0, 1, 2, 4, 6, 8, 5, 7, 3, 9 });
*/
test_frame("kdvdvdztw", { 9, 1, 3, 5, 0, 7, 2, 4, 8, 6 });
//random_test();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment