Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
class Doc2VecTrainer(object):
    """Train a Doc2Vec model on one corpus, or reload a previously saved one.

    Besides the corpus passed to the constructor, ``run`` reads several names
    that are not defined in this class and must exist at module level:
    ``load_existing``, ``PATH_TO_EXISTING_MODEL``, ``model_dimensions``,
    ``epochs``, ``doc2vec_model`` and ``word2vec_model``.
    """

    def __init__(self, train_corpus):
        """Remember the training corpus.

        Args:
            train_corpus: Corpus reference; ``run`` prints it and hands it,
                wrapped in a one-element list, to ``DocumentsIterable``.
        """
        self.train_corpus = train_corpus

    def run(self):
        """Load an existing model or build, train and save a new one.

        When ``load_existing`` is truthy the model is read from
        ``PATH_TO_EXISTING_MODEL`` and nothing is trained or written.
        Otherwise a new model is created, its vocabulary is built from the
        corpus, it is trained, and it is saved to ``doc2vec_model`` and
        ``word2vec_model``. In both cases the number of learned document
        vectors is printed at the end. Returns ``None``.
        """
        print('app started')

        cores = multiprocessing.cpu_count()
        print('num of cores is %s' % cores)
        gc.collect()

        if load_existing:
            print('loading an existing model')
            model = Doc2Vec.load(PATH_TO_EXISTING_MODEL)
        else:
            print('reading training corpus from %s' % self.train_corpus)
            corpus_data = DocumentsIterable([self.train_corpus])

            model = Doc2Vec(size=model_dimensions, window=10, min_count=3,
                            sample=1e-4, negative=5, workers=cores, dm=1)
            print('building vocabulary...')
            model.build_vocab(corpus_data)

            # Training and saving stay inside this branch because
            # corpus_data only exists here; a freshly loaded model is left
            # untouched.
            # NOTE(review): the original indentation was lost, so this
            # placement is a reconstruction - confirm it matches intent.
            #
            # NOTE(review): every pass of this loop asks train() for
            # model.iter epochs, so the corpus is seen epochs * model.iter
            # times and the learning-rate schedule restarts on each call.
            # Confirm that is intended; a single train() call is the usual
            # pattern.
            for epoch in range(epochs):
                print('Now training epoch %s' % epoch)
                # presumably random.shuffle, which needs a mutable sequence -
                # verify that DocumentsIterable supports it.
                shuffle(corpus_data)
                model.train(corpus_data,
                            total_examples=model.corpus_count,
                            epochs=model.iter)

            model.save(doc2vec_model)
            model.save_word2vec_format(word2vec_model)

        print('total docs learned %s' % (len(model.docvecs)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment