xeoncross/Gensim.py

## Gensim.py
from gensim.utils import simple_preprocess

tokenize = lambda x: simple_preprocess(x)
# tokenize("We can load the vocabulary from the JSON file, and generate a reverse mapping (from index to word, so that we can decode an encoded string if we want)?!")

import os
import json
import numpy as np
from gensim.models import Word2Vec

def create_embeddings(data_dir, embeddings_path, vocab_path, **params):
    class SentenceGenerator(object):
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                for line in open(os.path.join(self.dirname, fname), encoding='utf-8', errors='ignore'):
                    line = line.encode('utf-8').strip()
                    yield tokenize(line)

    sentences = SentenceGenerator(data_dir)

    model = Word2Vec(sentences, **params)
    weights = model.syn0
    np.save(open(embeddings_path, 'wb'), weights)

    vocab = dict([(k, v.index) for k, v in model.vocab.items()])
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))
    return model

model = create_embeddings("~/Desktop/AIML/Datasets/Bible/", "~/Desktop/AIML/Datasets/Bible/embeddings.txt", "~/Desktop/AIML/Datasets/Bible/Vocab.txt")

model.most_similar(positive=['woman', 'king'], negative=['man'])
	from gensim.utils import simple_preprocess

	tokenize = lambda x: simple_preprocess(x)
	# tokenize("We can load the vocabulary from the JSON file, and generate a reverse mapping (from index to word, so that we can decode an encoded string if we want)?!")

	import os
	import json
	import numpy as np
	from gensim.models import Word2Vec

	def create_embeddings(data_dir, embeddings_path, vocab_path, **params):
	class SentenceGenerator(object):
	def __init__(self, dirname):
	self.dirname = dirname

	def __iter__(self):
	for fname in os.listdir(self.dirname):
	for line in open(os.path.join(self.dirname, fname), encoding='utf-8', errors='ignore'):
	line = line.encode('utf-8').strip()
	yield tokenize(line)

	sentences = SentenceGenerator(data_dir)

	model = Word2Vec(sentences, **params)
	weights = model.syn0
	np.save(open(embeddings_path, 'wb'), weights)

	vocab = dict([(k, v.index) for k, v in model.vocab.items()])
	with open(vocab_path, 'w') as f:
	f.write(json.dumps(vocab))
	return model

	model = create_embeddings("~/Desktop/AIML/Datasets/Bible/", "~/Desktop/AIML/Datasets/Bible/embeddings.txt", "~/Desktop/AIML/Datasets/Bible/Vocab.txt")

	model.most_similar(positive=['woman', 'king'], negative=['man'])