This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void Model::computeOutputSoftmax(Vector& hidden, Vector& output) const { | |
if (quant_ && args_->qout) { | |
output.mul(*qwo_, hidden); | |
} else { | |
output.mul(*wo_, hidden); | |
} | |
real max = output[0], z = 0.0; | |
for (int32_t i = 0; i < osz_; i++) { | |
max = std::max(output[i], max); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden) const { | |
assert(hidden.size() == hsz_); | |
hidden.zero(); | |
for (auto it = input.cbegin(); it != input.cend(); ++it) { | |
if(quant_) { | |
hidden.addRow(*qwi_, *it); | |
} else { | |
hidden.addRow(*wi_, *it); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Runs one supervised update on `model` for a single example.
//
// Does nothing when either `labels` or `line` is empty. Otherwise one entry
// of `labels` is drawn uniformly at random (indices 0 .. labels.size() - 1,
// using the model's own `rng`) and handed to `model.update` together with
// `line` and `lr`.
//
// model  - model to update; its `rng` member supplies the randomness.
// lr     - forwarded unchanged to `model.update` (presumably the learning
//          rate -- confirm against Model::update).
// line   - input ids for the example, forwarded unchanged.
// labels - candidate target ids; exactly one is used per call.
void FastText::supervised(
    Model& model,
    real lr,
    const std::vector<int32_t>& line,
    const std::vector<int32_t>& labels) {
  if (labels.empty() || line.empty()) {
    return;
  }
  // The distribution is inclusive on both ends, hence size() - 1.
  std::uniform_int_distribution<> pick(0, labels.size() - 1);
  int32_t chosen = pick(model.rng);
  int32_t target = labels[chosen];
  model.update(line, target, lr);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void Dictionary::addWordNgrams(std::vector<int32_t>& line, | |
const std::vector<int32_t>& hashes, | |
int32_t n) const { | |
for (int32_t i = 0; i < hashes.size(); i++) { | |
uint64_t h = hashes[i]; | |
for (int32_t j = i + 1; j < hashes.size() && j < i + n; j++) { | |
h = h * 116049371 + hashes[j]; | |
pushHash(line, h % args_->bucket); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int32_t Dictionary::getLine(std::istream& in, | |
std::vector<int32_t>& words, | |
std::vector<int32_t>& labels) const { | |
std::vector<int32_t> word_hashes; | |
std::string token; | |
int32_t ntokens = 0; | |
reset(in); | |
words.clear(); | |
labels.clear(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void FastText::trainThread(int32_t threadId) { | |
std::ifstream ifs(args_->input); | |
utils::seek(ifs, threadId * utils::size(ifs) / args_->thread); | |
Model model(input_, output_, args_, threadId); | |
if (args_->model == model_name::sup) { | |
model.setTargetCounts(dict_->getCounts(entry_type::label)); | |
} else { | |
model.setTargetCounts(dict_->getCounts(entry_type::word)); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if (args_->pretrainedVectors.size() != 0) { | |
loadVectors(args_->pretrainedVectors); | |
} else { | |
input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim); | |
input_->uniform(1.0 / args_->dim); | |
} | |
if (args_->model == model_name::sup) { | |
output_ = std::make_shared<Matrix>(dict_->nlabels(), args_->dim); | |
} else { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void Dictionary::initNgrams() { | |
for (size_t i = 0; i < size_; i++) { | |
std::string word = BOW + words_[i].word + EOW; | |
words_[i].subwords.clear(); | |
words_[i].subwords.push_back(i); | |
if (words_[i].word != EOS) { | |
computeSubwords(word, words_[i].subwords); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
struct entry { | |
std::string word; | |
int64_t count; | |
entry_type type; | |
std::vector<int32_t> subwords; | |
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void Dictionary::readFromFile(std::istream& in) { | |
std::string word; | |
int64_t minThreshold = 1; | |
while (readWord(in, word)) { | |
add(word); | |
if (ntokens_ % 1000000 == 0 && args_->verbose > 1) { | |
std::cerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush; | |
} | |
if (size_ > 0.75 * MAX_VOCAB_SIZE) { | |
minThreshold++; |
Newer Older