Skip to content

Instantly share code, notes, and snippets.

@mrm1001
Created August 11, 2018 16:46
Show Gist options
  • Save mrm1001/8754f4d0d2c34be8278dbc28223e7a74 to your computer and use it in GitHub Desktop.
Save mrm1001/8754f4d0d2c34be8278dbc28223e7a74 to your computer and use it in GitHub Desktop.
FastText
int32_t Dictionary::getLine(std::istream& in,
std::vector<int32_t>& words,
std::vector<int32_t>& labels) const {
std::vector<int32_t> word_hashes;
std::string token;
int32_t ntokens = 0;
reset(in);
words.clear();
labels.clear();
while (readWord(in, token)) {
uint32_t h = hash(token);
int32_t wid = getId(token, h);
entry_type type = wid < 0 ? getType(token) : getType(wid);
ntokens++;
if (type == entry_type::word) {
addSubwords(words, token, wid);
word_hashes.push_back(h);
} else if (type == entry_type::label && wid >= 0) {
labels.push_back(wid - nwords_);
}
if (token == EOS) break;
}
addWordNgrams(words, word_hashes, args_->wordNgrams);
return ntokens;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment