Skip to content

Instantly share code, notes, and snippets.

@kleysonr
Created October 31, 2019 10:38
Show Gist options
  • Save kleysonr/2b414a5aceaf37898e3a86f0f3837dc3 to your computer and use it in GitHub Desktop.
# Train a Word2Vec embedding model with gensim over a pre-tokenizable text corpus.
import gensim
import codecs
import logging
from nltk.tokenize import word_tokenize
# Root-logger config so gensim's own training progress messages are visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def read_data(txtfile):
    """Yield one tokenized document per line of *txtfile*.

    The file is read as UTF-8; each line is stripped of its trailing
    newline/whitespace and split into word tokens with NLTK's
    ``word_tokenize``. Progress is logged every 10000 lines.

    :param txtfile: path to a UTF-8 text file, one document per line
    :yields: list[str] -- the tokens of one document
    """
    # Plain built-in open with an encoding replaces the legacy codecs.open.
    with open(txtfile, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                logging.info("read {0} documents".format(i))
            t = line.rstrip()
            yield word_tokenize(t)
# Materialize the corpus: Word2Vec scans it multiple times (vocab build +
# each training epoch), so a one-shot generator is not enough.
documents = list(read_data('dataset/preprocessed_data.txt'))

# Word2Vec trains immediately when a corpus is passed to the constructor.
# The original code then called model.train(..., epochs=10) on top of the
# constructor's default 5 epochs, double-training the model. Request the
# 10 epochs up front instead (gensim 3.x spells this `iter`; in gensim 4+
# the parameters are `epochs` and `vector_size`).
model = gensim.models.Word2Vec(documents, size=100, window=5, min_count=1, workers=7, iter=10)

# Save just the word vectors in the portable word2vec text format.
model.wv.save_word2vec_format('embeddings/word2vec-s100.vec')
# Save the full model (allows resuming training later).
model.save('models/word2vec-s100.model')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment