@xeoncross
Forked from Orbifold/Gensim.py
Created July 11, 2017 16:59
An experiment using Word2Vec on the Bible.
from gensim.utils import simple_preprocess
tokenize = lambda x: simple_preprocess(x)
# tokenize("We can load the vocabulary from the JSON file, and generate a reverse mapping (from index to word, so that we can decode an encoded string if we want)?!")
import os
import json
import numpy as np
from gensim.models import Word2Vec
def create_embeddings(data_dir, embeddings_path, vocab_path, **params):
    class SentenceGenerator(object):
        """Stream one tokenized line at a time from every file in a
        directory, so the whole corpus never has to sit in memory."""
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                for line in open(os.path.join(self.dirname, fname), encoding='utf-8', errors='ignore'):
                    yield tokenize(line)

    # Expand "~" so the paths can be opened directly.
    data_dir = os.path.expanduser(data_dir)
    embeddings_path = os.path.expanduser(embeddings_path)
    vocab_path = os.path.expanduser(vocab_path)

    sentences = SentenceGenerator(data_dir)
    model = Word2Vec(sentences, **params)

    # The embedding matrix lives on model.wv; `vectors` is the gensim 4.x
    # name for what older releases exposed as `syn0`.
    weights = model.wv.vectors
    np.save(open(embeddings_path, 'wb'), weights)

    # `key_to_index` is the gensim 4.x word -> index mapping (formerly
    # built by hand from model.vocab).
    vocab = dict(model.wv.key_to_index)
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))
    return model
model = create_embeddings(
    "~/Desktop/AIML/Datasets/Bible/",
    "~/Desktop/AIML/Datasets/Bible/embeddings.txt",
    "~/Desktop/AIML/Datasets/Bible/Vocab.txt",
)
# Classic analogy check: vector('king') - vector('man') + vector('woman')
# should land near 'queen'. In gensim 4.x, most_similar lives on model.wv.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
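
As the commented tokenize example above suggests, the saved JSON vocabulary can be loaded back and inverted into an index-to-word mapping for decoding. A minimal sketch of that step, reusing the imports already in this script (the load_embeddings helper and its name are illustrative, not part of the original gist):

def load_embeddings(embeddings_path, vocab_path):
    # word -> index mapping, as written by create_embeddings
    with open(os.path.expanduser(vocab_path)) as f:
        vocab = json.load(f)
    # reverse mapping: index -> word, so encoded sequences can be decoded
    reverse_vocab = {index: word for word, index in vocab.items()}
    # weight matrix saved with np.save; row i is the vector for reverse_vocab[i]
    weights = np.load(open(os.path.expanduser(embeddings_path), 'rb'))
    return weights, vocab, reverse_vocab

weights, vocab, reverse_vocab = load_embeddings(
    "~/Desktop/AIML/Datasets/Bible/embeddings.txt",
    "~/Desktop/AIML/Datasets/Bible/Vocab.txt",
)
print(reverse_vocab[vocab['king']])  # -> 'king'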