Skip to content

Instantly share code, notes, and snippets.

@zygm0nt
Created August 16, 2018 07:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zygm0nt/b981b54035003b622d3af6cf098ad391 to your computer and use it in GitHub Desktop.
Save zygm0nt/b981b54035003b622d3af6cf098ad391 to your computer and use it in GitHub Desktop.
class X:
    """Train a Word2Vec model on gzip-compressed session files and dump its vectors.

    NOTE(review): the methods call ``self.input()``, ``self.output()`` and
    ``self.work_dir()`` and read ``vector_size``, ``window_size``,
    ``min_count``, ``parallelism``, ``skip_gram`` and ``epoch_num``; none of
    these are defined in the class as shown, so they are presumably supplied
    by a base class or mixin elsewhere -- confirm.
    """

    def read_all_sessions(self):
        """Load every session found in the first input's directory.

        Every file in the directory is opened as gzip and read line by line;
        each line is split on whitespace into one list of tokens.

        Returns:
            A list with one token list per input line, across all files.
        """
        sessions_dir = self.input()[0].path
        # os.listdir order is arbitrary; sorting keeps the load order
        # reproducible between runs.
        file_paths = [os.path.join(sessions_dir, f_name)
                      for f_name in sorted(os.listdir(sessions_dir))]

        all_sessions = []
        for file_path in tqdm(iterable=file_paths, mininterval=1.0,
                              desc='Loading sessions'):
            # Text mode is required: the default binary mode yields bytes
            # tokens, which end up written as "b'token'" in the output file.
            # NOTE(review): assumes the session files are UTF-8 -- confirm.
            with gzip.open(file_path, 'rt', encoding='utf-8') as stream:
                all_sessions.extend(line.split() for line in stream)
        return all_sessions

    def run(self):
        """Train the model and write one ``<token> <v1> <v2> ...`` line per token.

        Creates the working directory if needed, trains Word2Vec on all
        sessions, then writes every vocabulary entry and its vector
        components, space separated, to the output path.
        """
        # exist_ok avoids the check-then-create race when the directory is
        # created concurrently.
        os.makedirs(self.work_dir(), exist_ok=True)

        # NOTE(review): `size`, `iter` and `wv.vocab` are the pre-4.0 gensim
        # names; confirm the pinned gensim version before upgrading.
        model = Word2Vec(sentences=self.read_all_sessions(),
                         size=self.vector_size,
                         window=self.window_size,
                         min_count=self.min_count,
                         workers=self.parallelism,
                         sg=self.skip_gram,
                         iter=self.epoch_num)

        output_path = self.output().path
        logger.info('Writing item vectors to %s', output_path)
        # Written with the same encoding the sessions were read with.
        with open(output_path, 'w', encoding='utf-8') as f:
            for w in model.wv.vocab:
                components = ' '.join(str(v) for v in model.wv[w])
                f.write('{} {}\n'.format(w, components))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment