-
-
Save takei-yuya/5f646c5aa7f43f708563844b931631ae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const char kSeparatorChar = '$'; // 区切り文字もしくは終端文字 | |
size_t CommonPrefixLength(const std::string& lhs, const std::string& rhs) { | |
size_t min_size = std::min(lhs.size(), rhs.size()); | |
for (size_t i = 0; i < min_size; ++i) { | |
if (lhs[i] == kSeparatorChar) return i; | |
if (rhs[i] == kSeparatorChar) return i; | |
if (lhs[i] != rhs[i]) return i; | |
} | |
return min_size; | |
} | |
// Longest Common Substring (not Subsequence) | |
class LCS { | |
public: | |
LCS() { | |
} | |
void AddText(const std::string& data) { | |
offsets_.push_back(concat_docs_.size()); | |
concat_docs_ += data + kSeparatorChar; | |
} | |
std::string Solve(size_t k = 0) const { | |
if (k == 0) k = offsets_.size(); // Kの指定がなければ全データで共通の部分文字列を探す | |
if (k > offsets_.size()) k = offsets_.size(); | |
std::string lcs; | |
std::vector<size_t> suffix_array = NaiveSuffixArray(); | |
SlidingWindow sliding_window(k); | |
std::string prev_string = ""; | |
for (size_t i = 0; i < suffix_array.size(); ++i) { | |
uint64_t pos = suffix_array[i]; | |
std::string cur_string = concat_docs_.substr(pos); | |
size_t prefix_len = CommonPrefixLength(prev_string, cur_string); | |
size_t common_len = sliding_window.Add(prefix_len, GetDocID(pos)); // LCPとdocidをSlidingWindowに渡す | |
if (common_len != 0) { // 制約条件を満たした | |
if (common_len > lcs.size()) lcs = cur_string.substr(0, common_len); | |
} | |
prev_string.swap(cur_string); | |
} | |
return lcs; | |
} | |
private: | |
std::vector<size_t> NaiveSuffixArray() const { | |
// 簡易SuffixArray | |
std::vector<size_t> result(concat_docs_.size()); | |
std::iota(result.begin(), result.end(), 0); | |
std::sort(result.begin(), result.end(), | |
[this](size_t lhs, size_t rhs){ return concat_docs_.substr(lhs) < concat_docs_.substr(rhs); }); | |
return result; | |
} | |
size_t GetDocID(size_t pos) const { | |
size_t docid = 0; | |
for (docid = 0; docid < offsets_.size(); ++docid) { | |
if (pos < offsets_[docid]) break; | |
} | |
return docid - 1; | |
} | |
private: | |
std::string concat_docs_; | |
std::vector<size_t> offsets_; | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment