Skip to content

Instantly share code, notes, and snippets.

@yaronv
Last active September 5, 2018 06:34
Show Gist options
  • Save yaronv/fb4e526b3fadd7ed497a1d2110ce37be to your computer and use it in GitHub Desktop.
Save yaronv/fb4e526b3fadd7ed497a1d2110ce37be to your computer and use it in GitHub Desktop.
class Doc2VecTrainer(object):
    """Obtain a Doc2Vec model, either by loading a saved one or by training.

    The class depends on names that are not defined inside it and must be
    provided by the enclosing module: ``load_existing``,
    ``PATH_TO_EXISTING_MODEL``, ``model_dimensions``, ``doc2vec_model``,
    ``word2vec_model``, ``MyCorpus`` and ``Doc2Vec``.
    """

    def __init__(self, train_corpus):
        """Store ``train_corpus``; ``run`` hands it to ``MyCorpus``."""
        self.train_corpus = train_corpus

    def run(self):
        """Load or train a model, then print how many doc vectors it holds.

        When the module-level ``load_existing`` flag is truthy the model is
        read from ``PATH_TO_EXISTING_MODEL``; otherwise a new model is trained
        on ``self.train_corpus`` and written to disk. Progress is reported
        with ``print``. Nothing is returned.
        """
        print('app started')

        worker_count = multiprocessing.cpu_count()
        print('num of cores is %s' % worker_count)
        gc.collect()

        if not load_existing:
            model = self._train_and_save(worker_count)
        else:
            print('loading an exiting model')
            model = Doc2Vec.load(PATH_TO_EXISTING_MODEL)

        print('total docs learned %s' % (len(model.docvecs)))

    def _train_and_save(self, worker_count):
        """Train a new model on ``self.train_corpus``, save it, return it."""
        print('reading training corpus from %s' % self.train_corpus)
        documents = MyCorpus(self.train_corpus)

        hyperparameters = dict(
            size=model_dimensions,
            window=10,
            min_count=3,
            sample=1e-4,
            negative=5,
            workers=worker_count,
            dm=1,
        )
        model = Doc2Vec(**hyperparameters)

        print('building vocabulary...')
        model.build_vocab(documents)
        model.train(documents, total_examples=model.corpus_count, epochs=20)

        # NOTE(review): the original indentation was lost, so it is assumed
        # that saving happens only after a fresh training run and not after
        # loading an existing model -- confirm against the author's intent.
        model.save(doc2vec_model)
        model.save_word2vec_format(word2vec_model)
        return model
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment