Skip to content

Instantly share code, notes, and snippets.

@Orbifold
Created December 12, 2016 06:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Orbifold/555a73fe0350d6e1ca675975c3b67ca1 to your computer and use it in GitHub Desktop.
Save Orbifold/555a73fe0350d6e1ca675975c3b67ca1 to your computer and use it in GitHub Desktop.
A Word2Vec experiment using the Bible as the training corpus.
from gensim.utils import simple_preprocess
def tokenize(text):
    """Return a list of lowercase word tokens extracted from *text*.

    Thin named wrapper around :func:`gensim.utils.simple_preprocess`
    (lowercases, strips punctuation, drops very short/long tokens).
    A ``def`` is preferred over ``name = lambda ...`` (PEP 8, E731) so
    tracebacks show a useful function name.
    """
    return simple_preprocess(text)
import os
import json
import numpy as np
from gensim.models import Word2Vec
def create_embeddings(data_dir, embeddings_path, vocab_path, **params):
    """Train a Word2Vec model on every file under *data_dir* and persist it.

    Parameters
    ----------
    data_dir : str
        Directory whose files are read line by line as training sentences.
        A leading ``~`` is expanded to the user's home directory.
    embeddings_path : str
        Destination file for the embedding weight matrix (``numpy.save``).
    vocab_path : str
        Destination file for the word -> index mapping, written as JSON.
    **params
        Extra keyword arguments forwarded to ``gensim.models.Word2Vec``
        (e.g. ``size``, ``window``, ``min_count``).

    Returns
    -------
    Word2Vec
        The trained model.
    """
    # The call site passes "~/..." paths; open()/listdir() do not expand
    # "~", so do it here (the original crashed on such paths).
    data_dir = os.path.expanduser(data_dir)
    embeddings_path = os.path.expanduser(embeddings_path)
    vocab_path = os.path.expanduser(vocab_path)

    class SentenceGenerator(object):
        """Yield one tokenized sentence per line of every file in *dirname*.

        Implemented as a re-iterable object (not a generator function)
        because Word2Vec iterates the corpus more than once.
        """

        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                path = os.path.join(self.dirname, fname)
                # `with` closes each file (the original left handles open).
                with open(path, encoding='utf-8', errors='ignore') as fh:
                    for line in fh:
                        # Tokenize the decoded text directly; re-encoding to
                        # bytes first (as the original did) was needless churn.
                        yield tokenize(line.strip())

    sentences = SentenceGenerator(data_dir)
    model = Word2Vec(sentences, **params)

    # NOTE(review): ``model.syn0`` / ``model.vocab`` are the pre-1.0 gensim
    # attribute names; on gensim >= 1.0 these live on ``model.wv``
    # (``model.wv.syn0`` / later ``model.wv.vectors``, ``model.wv.vocab``).
    # Confirm against the installed gensim version.
    weights = model.syn0
    with open(embeddings_path, 'wb') as f:
        np.save(f, weights)  # context manager closes the handle (original leaked it)

    vocab = {word: entry.index for word, entry in model.vocab.items()}
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    return model
# Train word embeddings on the Bible corpus; the weight matrix and the
# word->index vocabulary are written next to the data files.
model = create_embeddings(
    "~/Desktop/AIML/Datasets/Bible/",
    "~/Desktop/AIML/Datasets/Bible/embeddings.txt",
    "~/Desktop/AIML/Datasets/Bible/Vocab.txt",
)
# Classic analogy sanity check: king - man + woman should land near "queen".
model.most_similar(negative=['man'], positive=['woman', 'king'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment