Skip to content

Instantly share code, notes, and snippets.

@takei-yuya
Last active May 12, 2020 10:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takei-yuya/5f646c5aa7f43f708563844b931631ae to your computer and use it in GitHub Desktop.
Save takei-yuya/5f646c5aa7f43f708563844b931631ae to your computer and use it in GitHub Desktop.
const char kSeparatorChar = '$'; // 区切り文字もしくは終端文字
size_t CommonPrefixLength(const std::string& lhs, const std::string& rhs) {
size_t min_size = std::min(lhs.size(), rhs.size());
for (size_t i = 0; i < min_size; ++i) {
if (lhs[i] == kSeparatorChar) return i;
if (rhs[i] == kSeparatorChar) return i;
if (lhs[i] != rhs[i]) return i;
}
return min_size;
}
// Longest Common Substring (not Subsequence)
class LCS {
public:
LCS() {
}
void AddText(const std::string& data) {
offsets_.push_back(concat_docs_.size());
concat_docs_ += data + kSeparatorChar;
}
std::string Solve(size_t k = 0) const {
if (k == 0) k = offsets_.size(); // Kの指定がなければ全データで共通の部分文字列を探す
if (k > offsets_.size()) k = offsets_.size();
std::string lcs;
std::vector<size_t> suffix_array = NaiveSuffixArray();
SlidingWindow sliding_window(k);
std::string prev_string = "";
for (size_t i = 0; i < suffix_array.size(); ++i) {
uint64_t pos = suffix_array[i];
std::string cur_string = concat_docs_.substr(pos);
size_t prefix_len = CommonPrefixLength(prev_string, cur_string);
size_t common_len = sliding_window.Add(prefix_len, GetDocID(pos)); // LCPとdocidをSlidingWindowに渡す
if (common_len != 0) { // 制約条件を満たした
if (common_len > lcs.size()) lcs = cur_string.substr(0, common_len);
}
prev_string.swap(cur_string);
}
return lcs;
}
private:
std::vector<size_t> NaiveSuffixArray() const {
// 簡易SuffixArray
std::vector<size_t> result(concat_docs_.size());
std::iota(result.begin(), result.end(), 0);
std::sort(result.begin(), result.end(),
[this](size_t lhs, size_t rhs){ return concat_docs_.substr(lhs) < concat_docs_.substr(rhs); });
return result;
}
size_t GetDocID(size_t pos) const {
size_t docid = 0;
for (docid = 0; docid < offsets_.size(); ++docid) {
if (pos < offsets_[docid]) break;
}
return docid - 1;
}
private:
std::string concat_docs_;
std::vector<size_t> offsets_;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment