Skip to content

Instantly share code, notes, and snippets.

@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 15:28
FastText - Dictionary constructor
// Constructs a Dictionary that keeps a copy of the shared Args handle.
// word2int_ is built from (MAX_VOCAB_SIZE, -1) -- presumably MAX_VOCAB_SIZE
// slots each holding -1; confirm against the member's declared type.
// size_, nwords_, nlabels_ and ntokens_ start at 0; pruneidx_size_ starts
// at -1.
Dictionary::Dictionary(std::shared_ptr<Args> args)
    : args_(args),
      word2int_(MAX_VOCAB_SIZE, -1),
      size_(0),
      nwords_(0),
      nlabels_(0),
      ntokens_(0),
      pruneidx_size_(-1) {}
@mrm1001
mrm1001 / train.cc
Created August 11, 2018 15:21
FastText - the main train function
void FastText::train(const Args args) {
args_ = std::make_shared<Args>(args);
dict_ = std::make_shared<Dictionary>(args_);
if (args_->input == "-") {
// manage expectations
throw std::invalid_argument("Cannot use stdin for training!");
}
std::ifstream ifs(args_->input);
if (!ifs.is_open()) {
throw std::invalid_argument(
@mrm1001
mrm1001 / dict.cc
Last active August 12, 2018 20:38
FastText - creating dictionary
int32_t Dictionary::find(const std::string& w, uint32_t h) const {
int32_t word2intsize = word2int_.size();
int32_t id = h % word2intsize;
while (word2int_[id] != -1 && words_[word2int_[id]].word != w) {
id = (id + 1) % word2intsize;
}
return id;
}
int32_t Dictionary::find(const std::string& w) const {
@mrm1001
mrm1001 / train.cc
Created August 8, 2018 21:37
FastText input
// Set up the input matrix: either create a fresh one or load pretrained
// vectors, depending on whether args_->pretrainedVectors is empty.
if (args_->pretrainedVectors.size() == 0) {
  // No pretrained vectors given: build a Matrix from
  // (nwords + bucket, dim) and call uniform() with 1.0 / dim.
  // NOTE(review): presumably (rows, cols) and a random-init bound --
  // confirm against Matrix.
  input_ = std::make_shared<Matrix>(
      dict_->nwords() + args_->bucket, args_->dim);
  input_->uniform(1.0 / args_->dim);
} else {
  // A pretrained-vectors value was supplied: delegate to loadVectors().
  loadVectors(args_->pretrainedVectors);
}
@mrm1001
mrm1001 / es_features.py
Created August 3, 2016 09:00 — forked from konradkonrad/es_features.py
tfidf from elasticsearch
import elasticsearch
from math import log
def tfidf_matrix(es, index, doc_type, fields, size=10, bulk=500, query=dict(match_all=[])):
"""Generate tfidf for `size` documents of `index`/`doc_type`.
All `fields` need to have the mapping "term_vector": "yes".
This is the consuming version (i.e. get everything at once).
:param es: elasticsearch client
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>1. Creating an RDD</h1>"
]
},
{