mrm1001

## gist:94f2a09ef090f6132f62
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1>1. Creating an RDD</h1>"
   ]
  },
  {

## es_features.py
import elasticsearch
from math import log


def tfidf_matrix(es, index, doc_type, fields, size=10, bulk=500, query=dict(match_all=[])):
    """Generate tfidf for `size` documents of `index`/`doc_type`.
    All `fields` need to have the mapping "term_vector": "yes".
    This is the consuming version (i.e. get everything at once).

    :param es: elasticsearch client

## train.cc
  if (args_->pretrainedVectors.size() != 0) {
    loadVectors(args_->pretrainedVectors);
  } else {
    input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
    input_->uniform(1.0 / args_->dim);
  }

## dict.cc
int32_t Dictionary::find(const std::string& w, uint32_t h) const {
  int32_t word2intsize = word2int_.size();
  int32_t id = h % word2intsize;
  while (word2int_[id] != -1 && words_[word2int_[id]].word != w) {
    id = (id + 1) % word2intsize;
  }
  return id;
}

int32_t Dictionary::find(const std::string& w) const {

## train.cc
void FastText::train(const Args args) {
  args_ = std::make_shared<Args>(args);
  dict_ = std::make_shared<Dictionary>(args_);
  if (args_->input == "-") {
    // manage expectations
    throw std::invalid_argument("Cannot use stdin for training!");
  }
  std::ifstream ifs(args_->input);
  if (!ifs.is_open()) {
    throw std::invalid_argument(

## fasttext.cc
// Builds an empty dictionary bound to the shared `args`.
//
// `word2int_` is created with MAX_VOCAB_SIZE elements, all set to -1 (the
// value Dictionary::find treats as an unoccupied slot). The size and
// word/label/token counters start at 0 and `pruneidx_size_` starts at -1.
Dictionary::Dictionary(std::shared_ptr<Args> args)
    : args_(args),
      word2int_(MAX_VOCAB_SIZE, -1),
      size_(0),
      nwords_(0),
      nlabels_(0),
      ntokens_(0),
      pruneidx_size_(-1) {
}

## fasttext.cc
void Dictionary::readFromFile(std::istream& in) {
  std::string word;
  int64_t minThreshold = 1;
  while (readWord(in, word)) {
    add(word);
    if (ntokens_ % 1000000 == 0 && args_->verbose > 1) {
      std::cerr << "\rRead " << ntokens_  / 1000000 << "M words" << std::flush;
    }
    if (size_ > 0.75 * MAX_VOCAB_SIZE) {
      minThreshold++;

## fasttext.cc
// A single dictionary record.
// NOTE(review): appears to be the element type of Dictionary::words_, which is
// accessed through `.word` and `.subwords` elsewhere in this file; confirm
// against the class declaration.
struct entry {
  // Text of the token. Dictionary::find compares it against the looked-up
  // string, and Dictionary::initNgrams wraps it in BOW/EOW before computing
  // subwords.
  std::string word;
  // NOTE(review): presumably the number of occurrences seen for `word`; no
  // visible code reads or writes it, so verify against the callers.
  int64_t count;
  // Category of the entry; `entry_type` is declared outside this view.
  entry_type type;
  // Ids attached to the entry. Dictionary::initNgrams clears the list, stores
  // the entry's own index first, then lets computeSubwords append to it unless
  // the word equals EOS.
  std::vector<int32_t> subwords;
};

## fasttext.cc
void Dictionary::initNgrams() {
  for (size_t i = 0; i < size_; i++) {
    std::string word = BOW + words_[i].word + EOW;
    words_[i].subwords.clear();
    words_[i].subwords.push_back(i);
    if (words_[i].word != EOS) {
      computeSubwords(word, words_[i].subwords);
    }
  }
}

## fasttext.cc
  if (args_->pretrainedVectors.size() != 0) {
    loadVectors(args_->pretrainedVectors);
  } else {
    input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
    input_->uniform(1.0 / args_->dim);
  }

  if (args_->model == model_name::sup) {
    output_ = std::make_shared<Matrix>(dict_->nlabels(), args_->dim);
  } else {
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"<h1>1. Creating an RDD</h1>"
	]
	},
	{
	import elasticsearch
	from math import log


	def tfidf_matrix(es, index, doc_type, fields, size=10, bulk=500, query=dict(match_all=[])):
	"""Generate tfidf for `size` documents of `index`/`doc_type`.
	All `fields` need to have the mapping "term_vector": "yes".
	This is the consuming version (i.e. get everything at once).

	:param es: elasticsearch client
	if (args_->pretrainedVectors.size() != 0) {
	loadVectors(args_->pretrainedVectors);
	} else {
	input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
	input_->uniform(1.0 / args_->dim);
	}
	int32_t Dictionary::find(const std::string& w, uint32_t h) const {
	int32_t word2intsize = word2int_.size();
	int32_t id = h % word2intsize;
	while (word2int_[id] != -1 && words_[word2int_[id]].word != w) {
	id = (id + 1) % word2intsize;
	}
	return id;
	}

	int32_t Dictionary::find(const std::string& w) const {
	void FastText::train(const Args args) {
	args_ = std::make_shared<Args>(args);
	dict_ = std::make_shared<Dictionary>(args_);
	if (args_->input == "-") {
	// manage expectations
	throw std::invalid_argument("Cannot use stdin for training!");
	}
	std::ifstream ifs(args_->input);
	if (!ifs.is_open()) {
	throw std::invalid_argument(
	// Builds an empty dictionary bound to the shared `args`.
	//
	// `word2int_` is created with MAX_VOCAB_SIZE elements, all set to -1 (the
	// value Dictionary::find treats as an unoccupied slot). The size and
	// word/label/token counters start at 0 and `pruneidx_size_` starts at -1.
	Dictionary::Dictionary(std::shared_ptr<Args> args)
	    : args_(args),
	      word2int_(MAX_VOCAB_SIZE, -1),
	      size_(0),
	      nwords_(0),
	      nlabels_(0),
	      ntokens_(0),
	      pruneidx_size_(-1) {
	}
	void Dictionary::readFromFile(std::istream& in) {
	std::string word;
	int64_t minThreshold = 1;
	while (readWord(in, word)) {
	add(word);
	if (ntokens_ % 1000000 == 0 && args_->verbose > 1) {
	std::cerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush;
	}
	if (size_ > 0.75 * MAX_VOCAB_SIZE) {
	minThreshold++;
	// A single dictionary record.
	// NOTE(review): appears to be the element type of Dictionary::words_, which is
	// accessed through `.word` and `.subwords` elsewhere in this file; confirm
	// against the class declaration.
	struct entry {
	// Text of the token. Dictionary::find compares it against the looked-up
	// string, and Dictionary::initNgrams wraps it in BOW/EOW before computing
	// subwords.
	std::string word;
	// NOTE(review): presumably the number of occurrences seen for `word`; no
	// visible code reads or writes it, so verify against the callers.
	int64_t count;
	// Category of the entry; `entry_type` is declared outside this view.
	entry_type type;
	// Ids attached to the entry. Dictionary::initNgrams clears the list, stores
	// the entry's own index first, then lets computeSubwords append to it unless
	// the word equals EOS.
	std::vector<int32_t> subwords;
	};
	void Dictionary::initNgrams() {
	for (size_t i = 0; i < size_; i++) {
	std::string word = BOW + words_[i].word + EOW;
	words_[i].subwords.clear();
	words_[i].subwords.push_back(i);
	if (words_[i].word != EOS) {
	computeSubwords(word, words_[i].subwords);
	}
	}
	}