Created
December 12, 2016 06:52
-
-
Save Orbifold/555a73fe0350d6e1ca675975c3b67ca1 to your computer and use it in GitHub Desktop.
An experiment using Word2Vec on the text of the Bible.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.utils import simple_preprocess


def tokenize(text):
    """Tokenize *text* with gensim's simple_preprocess.

    Lower-cases and splits the input into unicode word tokens
    (per gensim docs it also drops punctuation and tokens that are
    too short or too long). PEP 8 prefers ``def`` over assigning a
    lambda to a name.
    """
    return simple_preprocess(text)
import os | |
import json | |
import numpy as np | |
from gensim.models import Word2Vec | |
def create_embeddings(data_dir, embeddings_path, vocab_path, **params):
    """Train a Word2Vec model on every text file in *data_dir* and
    persist its artifacts.

    Parameters
    ----------
    data_dir : str
        Directory whose files are read line-by-line as the corpus.
        ``~`` is expanded.
    embeddings_path : str
        Destination for the embedding weight matrix (numpy ``.npy`` data).
    vocab_path : str
        Destination for the word -> index mapping, written as JSON.
    **params
        Extra keyword arguments forwarded verbatim to ``Word2Vec``.

    Returns
    -------
    The trained ``Word2Vec`` model.
    """

    class SentenceGenerator(object):
        """Stream tokenized lines from a directory of text files so the
        corpus never has to fit in memory (Word2Vec iterates it twice)."""

        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                path = os.path.join(self.dirname, fname)
                # Context manager closes each file; the original leaked
                # handles. NOTE: simple_preprocess expects str, so we must
                # NOT re-encode the line to bytes as the original did.
                with open(path, encoding='utf-8', errors='ignore') as fh:
                    for line in fh:
                        yield tokenize(line.strip())

    # open()/os.listdir() do not expand '~' -- do it explicitly.
    data_dir = os.path.expanduser(data_dir)
    embeddings_path = os.path.expanduser(embeddings_path)
    vocab_path = os.path.expanduser(vocab_path)

    sentences = SentenceGenerator(data_dir)
    model = Word2Vec(sentences, **params)

    # gensim >= 1.0 moved vectors/vocab onto model.wv; the old
    # model.syn0 / model.vocab attributes are removed in gensim 4.x.
    weights = model.wv.vectors
    with open(embeddings_path, 'wb') as f:
        np.save(f, weights)

    # wv.key_to_index is already the word -> int-index dict (gensim 4.x).
    vocab = dict(model.wv.key_to_index)
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)

    return model
# Train on the Bible corpus, then sanity-check the embeddings with the
# classic analogy query: king - man + woman ~= queen.
model = create_embeddings("~/Desktop/AIML/Datasets/Bible/",
                          "~/Desktop/AIML/Datasets/Bible/embeddings.txt",
                          "~/Desktop/AIML/Datasets/Bible/Vocab.txt")
# Similarity queries live on the KeyedVectors object (model.wv);
# Word2Vec.most_similar itself was removed in gensim 4.x.
print(model.wv.most_similar(positive=['woman', 'king'], negative=['man']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment