Skip to content

Instantly share code, notes, and snippets.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>1. Creating an RDD</h1>"
]
},
{
@mrm1001
mrm1001 / es_features.py
Created August 3, 2016 09:00 — forked from konradkonrad/es_features.py
tfidf from elasticsearch
import elasticsearch
from math import log
def tfidf_matrix(es, index, doc_type, fields, size=10, bulk=500, query=dict(match_all=[])):
"""Generate tfidf for `size` documents of `index`/`doc_type`.
All `fields` need to have the mapping "term_vector": "yes".
This is the consuming version (i.e. get everything at once).
:param es: elasticsearch client
@mrm1001
mrm1001 / train.cc
Created August 8, 2018 21:37
FastText input
// Set up the input matrix: build a fresh one when no pretrained-vector
// path was supplied, otherwise hand the path to loadVectors().
if (args_->pretrainedVectors.size() == 0) {
  // Matrix is constructed with (nwords + bucket, dim).
  // NOTE(review): presumably rows x columns, i.e. one row per word plus one
  // per hashing bucket — confirm against the Matrix constructor.
  input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
  // NOTE(review): presumably random initialization bounded by 1/dim — confirm
  // against Matrix::uniform.
  input_->uniform(1.0 / args_->dim);
} else {
  loadVectors(args_->pretrainedVectors);
}
@mrm1001
mrm1001 / dict.cc
Last active August 12, 2018 20:38
Fasttext - creating dictionary
int32_t Dictionary::find(const std::string& w, uint32_t h) const {
int32_t word2intsize = word2int_.size();
int32_t id = h % word2intsize;
while (word2int_[id] != -1 && words_[word2int_[id]].word != w) {
id = (id + 1) % word2intsize;
}
return id;
}
int32_t Dictionary::find(const std::string& w) const {
@mrm1001
mrm1001 / train.cc
Created August 11, 2018 15:21
FastText - the main train function
void FastText::train(const Args args) {
args_ = std::make_shared<Args>(args);
dict_ = std::make_shared<Dictionary>(args_);
if (args_->input == "-") {
// manage expectations
throw std::invalid_argument("Cannot use stdin for training!");
}
std::ifstream ifs(args_->input);
if (!ifs.is_open()) {
throw std::invalid_argument(
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 15:28
FastText
// Constructs an empty dictionary that keeps a shared handle to `args`.
//
// All counters start at 0 and pruneidx_size_ starts at -1.
// word2int_ is constructed from (MAX_VOCAB_SIZE, -1); -1 is the value
// Dictionary::find stops on while probing.
// NOTE(review): presumably word2int_ is a vector, so this yields
// MAX_VOCAB_SIZE slots all set to -1 — confirm against the declaration.
Dictionary::Dictionary(std::shared_ptr<Args> args)
    : args_(args),
      word2int_(MAX_VOCAB_SIZE, -1),
      size_(0),
      nwords_(0),
      nlabels_(0),
      ntokens_(0),
      pruneidx_size_(-1) {}
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 15:42
FastText
void Dictionary::readFromFile(std::istream& in) {
std::string word;
int64_t minThreshold = 1;
while (readWord(in, word)) {
add(word);
if (ntokens_ % 1000000 == 0 && args_->verbose > 1) {
std::cerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush;
}
if (size_ > 0.75 * MAX_VOCAB_SIZE) {
minThreshold++;
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 15:45
FastText
// One record of the dictionary's word list.
struct entry {
// Text of the token; Dictionary::find compares against this field while probing.
std::string word;
// NOTE(review): presumably the number of occurrences seen in the input — confirm against the code that fills it.
int64_t count;
// Category of the entry; entry_type is declared elsewhere — TODO confirm its possible values.
entry_type type;
// Ids attached to this entry. Dictionary::initNgrams clears the list, stores the
// entry's own index first, and then passes the list to computeSubwords (skipped
// when word equals EOS).
std::vector<int32_t> subwords;
};
@mrm1001
mrm1001 / fasttext.cc
Last active August 11, 2018 16:28
FastText
void Dictionary::initNgrams() {
for (size_t i = 0; i < size_; i++) {
std::string word = BOW + words_[i].word + EOW;
words_[i].subwords.clear();
words_[i].subwords.push_back(i);
if (words_[i].word != EOS) {
computeSubwords(word, words_[i].subwords);
}
}
}
@mrm1001
mrm1001 / fasttext.cc
Created August 11, 2018 16:31
FastText
// Set up the input matrix: build a fresh one when no pretrained-vector
// path was supplied, otherwise hand the path to loadVectors().
if (args_->pretrainedVectors.size() == 0) {
  // Matrix is constructed with (nwords + bucket, dim).
  // NOTE(review): presumably rows x columns, i.e. one row per word plus one
  // per hashing bucket — confirm against the Matrix constructor.
  input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
  // NOTE(review): presumably random initialization bounded by 1/dim — confirm
  // against Matrix::uniform.
  input_->uniform(1.0 / args_->dim);
} else {
  loadVectors(args_->pretrainedVectors);
}
if (args_->model == model_name::sup) {
output_ = std::make_shared<Matrix>(dict_->nlabels(), args_->dim);
} else {