Skip to content

Instantly share code, notes, and snippets.

@mrm1001
Last active August 11, 2018 16:28
Show Gist options
  • Save mrm1001/900d333b4b989e6727e195adae40757c to your computer and use it in GitHub Desktop.
Save mrm1001/900d333b4b989e6727e195adae40757c to your computer and use it in GitHub Desktop.
FastText
void Dictionary::initNgrams() {
for (size_t i = 0; i < size_; i++) {
std::string word = BOW + words_[i].word + EOW;
words_[i].subwords.clear();
words_[i].subwords.push_back(i);
if (words_[i].word != EOS) {
computeSubwords(word, words_[i].subwords);
}
}
}
// Appends the hashed character n-grams of `word` to `ngrams`.
//
// A "character" is one byte that is not of the form 10xxxxxx, followed by all
// directly trailing 10xxxxxx bytes (the UTF-8 continuation-byte pattern), so
// n-gram lengths are counted in characters rather than bytes.
//
// For every character start position, n-grams of 1 .. args_->maxn characters
// are grown; those with at least args_->minn characters are hashed, reduced
// modulo args_->bucket and forwarded to pushHash(). Single-character n-grams
// that sit at the very beginning or very end of `word` are skipped.
//
// word:   byte string to split.
// ngrams: output vector; ids are appended through pushHash().
void Dictionary::computeSubwords(const std::string& word,
                                 std::vector<int32_t>& ngrams) const {
  const size_t len = word.size();
  for (size_t start = 0; start < len; ++start) {
    // An n-gram may only begin on the first byte of a character.
    if ((word[start] & 0xC0) == 0x80) {
      continue;
    }

    std::string gram;
    size_t pos = start;
    for (size_t chars = 1; pos < len && chars <= args_->maxn; ++chars) {
      // Consume one whole character: its first byte plus any 10xxxxxx bytes.
      do {
        gram.push_back(word[pos++]);
      } while (pos < len && (word[pos] & 0xC0) == 0x80);

      // A lone character touching either end of the word is not emitted.
      const bool lone_boundary = chars == 1 && (start == 0 || pos == len);
      if (chars < args_->minn || lone_boundary) {
        continue;
      }

      int32_t h = hash(gram) % args_->bucket;
      pushHash(ngrams, h);
    }
  }
}
// Appends the id for n-gram bucket `id` to `hashes`, offset by nwords_.
//
// Behavior depends on pruneidx_size_:
//   == 0 : nothing is appended.
//   >  0 : `id` is remapped through pruneidx_; ids with no mapping are
//          dropped.
//   <  0 : `id` is used unchanged (presumably a "no pruning" sentinel --
//          confirm where pruneidx_size_ is initialized).
// Negative ids are always ignored.
//
// hashes: output vector that receives nwords_ + (possibly remapped) id.
// id:     bucket index of the n-gram.
void Dictionary::pushHash(std::vector<int32_t>& hashes, int32_t id) const {
  if (pruneidx_size_ == 0 || id < 0) {
    return;
  }
  if (pruneidx_size_ > 0) {
    // One lookup instead of count() followed by at().
    const auto it = pruneidx_.find(id);
    if (it == pruneidx_.end()) {
      return;
    }
    id = it->second;
  }
  hashes.push_back(nwords_ + id);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment