Skip to content

Instantly share code, notes, and snippets.

@mrm1001
mrm1001 / fasttext.cc
Created August 12, 2018 16:19
FastText
// Excerpt of Model::computeOutputSoftmax; the snippet is cut off after the
// maximum scan, so the remainder of the function is not shown here.
//
// Visible part: calls output.mul(...) with either the quantized output matrix
// (*qwo_) or the regular one (*wo_) together with `hidden`, then scans
// `output` for its largest entry.
// NOTE(review): the maximum is presumably used afterwards to stabilise the
// exponentials of the softmax -- confirm against the full function.
void Model::computeOutputSoftmax(Vector& hidden, Vector& output) const {
// The quantized output matrix is used only when both quant_ and args_->qout are set.
if (quant_ && args_->qout) {
output.mul(*qwo_, hidden);
} else {
output.mul(*wo_, hidden);
}
// z starts at 0 and is not touched in the visible part of the function.
real max = output[0], z = 0.0;
// Track the largest of the first osz_ entries of output.
for (int32_t i = 0; i < osz_; i++) {
max = std::max(output[i], max);
}
@mrm1001
mrm1001 / fasttext.cc
Created August 12, 2018 16:11
FastText
// Excerpt of Model::computeHidden; the snippet ends after the accumulation
// loop, so any later steps and the closing brace are not shown here.
//
// Visible part: checks that `hidden` has hsz_ entries, zeroes it, and then,
// for every id in `input`, calls hidden.addRow(...) with that id on the input
// matrix.
void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden) const {
assert(hidden.size() == hsz_);
hidden.zero();
for (auto it = input.cbegin(); it != input.cend(); ++it) {
// quant_ selects the quantized input matrix (*qwi_) over the regular one (*wi_).
if(quant_) {
hidden.addRow(*qwi_, *it);
} else {
hidden.addRow(*wi_, *it);
}
}
@mrm1001
mrm1001 / fasttext.cc
Created August 12, 2018 16:02
FastText
// Runs one supervised update step on `model`.
//
// One label is drawn uniformly at random from `labels` using the model's own
// random engine (model.rng), and model.update is called with the full `line`,
// that label, and the learning rate `lr`.
//
// Nothing happens when either `labels` or `line` is empty.
void FastText::supervised(
    Model& model,
    real lr,
    const std::vector<int32_t>& line,
    const std::vector<int32_t>& labels) {
  if (labels.empty() || line.empty()) {
    return;
  }
  // Inclusive range [0, labels.size() - 1]: every label position is eligible.
  std::uniform_int_distribution<> pick(0, labels.size() - 1);
  const int32_t chosen = pick(model.rng);
  model.update(line, labels[chosen], lr);
}
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 16:51
FastText
// Excerpt of Dictionary::addWordNgrams; the snippet ends after the outer loop,
// so the function's closing brace is not shown here.
//
// For every start position i in `hashes`, a running hash h is seeded with
// hashes[i] and then extended with each following hash at positions
// i+1 .. i+n-1 (bounded by the end of `hashes`). After each extension,
// h modulo args_->bucket is passed to pushHash together with `line`.
// With n <= 1 the inner loop never runs and pushHash is never called.
void Dictionary::addWordNgrams(std::vector<int32_t>& line,
const std::vector<int32_t>& hashes,
int32_t n) const {
// NOTE(review): i and j are int32_t while hashes.size() is unsigned, so these
// loop conditions are signed/unsigned comparisons.
for (int32_t i = 0; i < hashes.size(); i++) {
// NOTE(review): hashes holds int32_t; a negative value is sign-extended when
// converted to uint64_t here -- confirm that callers never pass negatives.
uint64_t h = hashes[i];
for (int32_t j = i + 1; j < hashes.size() && j < i + n; j++) {
// Multiply-and-add combination of the running hash with the next token hash.
h = h * 116049371 + hashes[j];
pushHash(line, h % args_->bucket);
}
}
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 16:46
FastText
// Excerpt of Dictionary::getLine; only the setup is shown, the token-reading
// part and the return statement are cut off.
//
// Visible part: declares the working variables, calls reset(in), and empties
// both output vectors so that `words` and `labels` start from a clean state.
// NOTE(review): the int32_t return value is presumably the token count
// (ntokens) -- confirm against the full function.
int32_t Dictionary::getLine(std::istream& in,
std::vector<int32_t>& words,
std::vector<int32_t>& labels) const {
std::vector<int32_t> word_hashes;
std::string token;
int32_t ntokens = 0;
reset(in);
words.clear();
labels.clear();
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 16:40
FastText
// Excerpt of FastText::trainThread; the snippet ends after the target counts
// are set, so the rest of the function is not shown here.
//
// Visible part: opens the input file named by args_->input, seeks to a
// thread-specific position, builds a Model for this thread and hands it the
// dictionary counts that match the training mode.
void FastText::trainThread(int32_t threadId) {
std::ifstream ifs(args_->input);
// Seek position = threadId * (file size) / (number of threads), so the
// position grows with threadId.
utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);
Model model(input_, output_, args_, threadId);
// The supervised model gets the label counts; every other model gets the word counts.
if (args_->model == model_name::sup) {
model.setTargetCounts(dict_->getCounts(entry_type::label));
} else {
model.setTargetCounts(dict_->getCounts(entry_type::word));
}
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 16:31
FastText
// Fragment from the middle of a function (its header and the else-branch at
// the end are not shown here): sets up the input_ and output_ matrices.
//
// When a pretrained-vectors path is given, loadVectors is called with it;
// otherwise input_ is created as a Matrix constructed with
// (dict_->nwords() + args_->bucket, args_->dim) and uniform(1.0 / args_->dim)
// is called on it.
// NOTE(review): presumably the two Matrix constructor arguments are rows and
// columns, and uniform(a) fills the matrix with random values bounded by a --
// confirm against the Matrix class.
if (args_->pretrainedVectors.size() != 0) {
loadVectors(args_->pretrainedVectors);
} else {
input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
input_->uniform(1.0 / args_->dim);
}
// For the supervised model the output matrix is constructed with
// (dict_->nlabels(), args_->dim).
if (args_->model == model_name::sup) {
output_ = std::make_shared<Matrix>(dict_->nlabels(), args_->dim);
} else {
@mrm1001
mrm1001 / fasttext.cc
Last active August 11, 2018 16:28
FastText
void Dictionary::initNgrams() {
for (size_t i = 0; i < size_; i++) {
std::string word = BOW + words_[i].word + EOW;
words_[i].subwords.clear();
words_[i].subwords.push_back(i);
if (words_[i].word != EOS) {
computeSubwords(word, words_[i].subwords);
}
}
}
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 15:45
FastText
// Record with four fields: a string, a 64-bit count, an entry_type tag and a
// list of int32_t subword ids.
// NOTE(review): presumably this is the element type of the dictionary's word
// table (Dictionary::initNgrams reads .word and .subwords from such entries)
// -- confirm against the Dictionary declaration.
struct entry {
// The token text.
std::string word;
// NOTE(review): presumably the number of occurrences seen -- confirm where it is incremented.
int64_t count;
// Kind of entry; entry_type::word and entry_type::label are the values used in this file.
entry_type type;
// Subword ids for this entry (int32_t values).
std::vector<int32_t> subwords;
};
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 15:42
FastText
// Excerpt of Dictionary::readFromFile; the snippet is cut off inside the
// vocabulary-size check, so the rest of the loop and function is not shown.
//
// Visible part: reads words from `in` one at a time with readWord and passes
// each to add(), printing a progress line along the way.
void Dictionary::readFromFile(std::istream& in) {
std::string word;
int64_t minThreshold = 1;
while (readWord(in, word)) {
add(word);
// Progress report to stderr each time ntokens_ is a multiple of one million,
// only when verbosity is above 1. The leading "\r" rewrites the same line.
if (ntokens_ % 1000000 == 0 && args_->verbose > 1) {
std::cerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush;
}
// Once size_ exceeds 75% of MAX_VOCAB_SIZE, minThreshold is raised by one.
// NOTE(review): presumably followed by pruning of entries below the threshold
// -- the snippet ends before that is visible.
if (size_ > 0.75 * MAX_VOCAB_SIZE) {
minThreshold++;