Skip to content

Instantly share code, notes, and snippets.

@joachimdb
Last active February 10, 2021 13:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joachimdb/e078b16e951a827c46cb462867616ba7 to your computer and use it in GitHub Desktop.
Save joachimdb/e078b16e951a827c46cb462867616ba7 to your computer and use it in GitHub Desktop.
from gensim.corpora import WikiCorpus
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import time
# Open the Wikipedia dump as a token stream.
# NOTE(review): the empty `dictionary` is presumably passed so that WikiCorpus
# does not scan the whole dump to build its own vocabulary -- confirm against
# the gensim WikiCorpus documentation.
wiki_corpus = WikiCorpus(
    "/data/enwiki-latest-pages-articles-multistream.xml.bz2",
    dictionary={},
)

# Learn multi-word phrases from one pass over the article texts, scoring
# candidate pairs with NPMI and keeping those at or above the threshold.
phrases = Phrases(
    wiki_corpus.get_texts(),
    min_count=5,
    threshold=0.5,
    scoring="npmi",
)

# Wrap the trained phrase model in a Phraser, which is what gets applied to
# the token stream when training Word2Vec below.
phraser = Phraser(phrases)
def secondsToStr(t):
    """Format a duration given in seconds as ``H:MM:SS.mmm``.

    Args:
        t: Duration in seconds (int or float).

    Returns:
        A string with unpadded hours, two-digit minutes and seconds, and
        three-digit milliseconds. Fractional milliseconds are truncated by
        the ``%d`` conversion.
    """
    # Explicit divmod steps instead of reduce(): reduce is not a builtin in
    # Python 3 and this file never imports it from functools.
    seconds, millis = divmod(t * 1000, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return "%d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
# Wall-clock reference point; EpochLogger reports elapsed time relative to it.
start_time = time.time()


class EpochLogger(CallbackAny2Vec):
    """Training callback that prints epoch boundaries and total elapsed time."""

    def __init__(self):
        # Zero-based index of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print a marker when an epoch starts; `model` is not used."""
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        """Print a marker and the elapsed time, then advance the epoch counter."""
        print("Epoch #{} end".format(self.epoch))
        # Elapsed time is measured from the module-level start_time, not from
        # the start of this epoch.
        elapsed = time.time() - start_time
        print("total elapsed time:", secondsToStr(elapsed))
        self.epoch += 1
class _PhrasedWikiTexts:
    """Restartable iterable of phrase-merged token lists from a corpus.

    Each call to ``__iter__`` requests a fresh ``get_texts()`` stream from the
    corpus and applies the phrase model to one article at a time.
    """

    def __init__(self, corpus, phrase_model):
        self.corpus = corpus
        self.phrase_model = phrase_model

    def __iter__(self):
        for tokens in self.corpus.get_texts():
            yield self.phrase_model[tokens]


wiki_corpus = WikiCorpus("/data/enwiki-latest-pages-articles-multistream.xml.bz2", dictionary={})

# Word2Vec walks its input several times (one vocabulary scan, then once per
# epoch), so the sentences must be restartable. Handing it
# `phraser[wiki_corpus.get_texts()]` wraps a single-use stream that is
# exhausted after the first pass; the wrapper re-creates the stream each time.
model = Word2Vec(
    _PhrasedWikiTexts(wiki_corpus, phraser),
    callbacks=[EpochLogger()],
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment