-
-
Save xeoncross/d573acb857b1e2a180cb3a96fe045898 to your computer and use it in GitHub Desktop.
A Word2Vec experiment on the text of the Bible.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.utils import simple_preprocess


def tokenize(text):
    """Lowercase *text* and split it into simple alphabetic tokens.

    Thin wrapper over gensim's ``simple_preprocess`` (PEP 8: prefer a
    ``def`` to binding a lambda to a name).
    """
    return simple_preprocess(text)
# Example:
# tokenize("We can load the vocabulary from the JSON file, and generate a "
#          "reverse mapping (from index to word, so that we can decode an "
#          "encoded string if we want)?!")
import os | |
import json | |
import numpy as np | |
from gensim.models import Word2Vec | |
def create_embeddings(data_dir, embeddings_path, vocab_path, **params):
    """Train a Word2Vec model on every text file under *data_dir* and persist it.

    Parameters
    ----------
    data_dir : str
        Directory whose files are read line by line as the training corpus.
        ``~`` is expanded.
    embeddings_path : str
        Where the embedding weight matrix is written (NumPy ``.npy`` format).
    vocab_path : str
        Where the ``{word: index}`` mapping is written as JSON.
    **params
        Forwarded verbatim to the ``gensim.models.Word2Vec`` constructor.

    Returns
    -------
    The trained ``Word2Vec`` model.
    """
    # open() does not expand "~" -- do it explicitly so tilde paths work.
    data_dir = os.path.expanduser(data_dir)
    embeddings_path = os.path.expanduser(embeddings_path)
    vocab_path = os.path.expanduser(vocab_path)

    class SentenceGenerator(object):
        """Re-iterable stream of tokenized lines from every file in a directory.

        Word2Vec iterates the corpus multiple times, so this must be a
        fresh-iterator object, not a one-shot generator.
        """

        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                # Context manager guarantees the handle is closed per file.
                with open(os.path.join(self.dirname, fname),
                          encoding='utf-8', errors='ignore') as fh:
                    for line in fh:
                        # Keep the line as str: simple_preprocess expects
                        # unicode text, not bytes (the original .encode()
                        # produced bytes).
                        yield tokenize(line.strip())

    sentences = SentenceGenerator(data_dir)
    model = Word2Vec(sentences, **params)

    # gensim >= 4 moved vectors/vocab onto model.wv
    # (model.syn0 / model.vocab were removed).
    weights = model.wv.vectors
    with open(embeddings_path, 'wb') as f:
        np.save(f, weights)

    # key_to_index is already the {word: index} mapping we want to persist.
    vocab = dict(model.wv.key_to_index)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f)

    return model
# Train embeddings over the Bible corpus, persisting the weight matrix and
# the word->index vocabulary alongside the data.
model = create_embeddings(
    "~/Desktop/AIML/Datasets/Bible/",
    "~/Desktop/AIML/Datasets/Bible/embeddings.txt",
    "~/Desktop/AIML/Datasets/Bible/Vocab.txt",
)
# Classic analogy probe: king - man + woman (expect something queen-like).
model.most_similar(positive=['woman', 'king'], negative=['man'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment