Skip to content

Instantly share code, notes, and snippets.

@Koziev
Created September 28, 2018 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Koziev/e39689adec30ae5bf6afaa1ca47c08e5 to your computer and use it in GitHub Desktop.
Save Koziev/e39689adec30ae5bf6afaa1ca47c08e5 to your computer and use it in GitHub Desktop.
Генерация файла w2v.CBOW=1_WIN=5_DIM=32.bin, используемого в чатботе
# -*- coding: utf-8 -*-
'''
Train word2vec embedding models for words.
The input is a ready-made, pre-tokenized corpus in which words are separated
by spaces and each sentence is on its own line.
'''
from __future__ import print_function
from gensim.models import word2vec
import logging
import os
# Emit gensim's progress messages (vocabulary scan, training epochs) to the console.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Training corpus: one sentence per line, tokens separated by spaces.
# Alternative corpus locations that were used with this script:
#   os.path.expanduser('~/Corpus/word2vector/ru/SENTx.corpus.w2v.txt')
#   os.path.expanduser('~/Corpus/Raw/ru/tokenized_w2v.txt')
corpus_path = os.path.expanduser(r'f:\Corpus\word2vector\ru\SENTx.corpus.w2v.txt')

# Training hyperparameters.
SIZE = 32       # dimensionality of the word vectors (the DIM part of the file name)
WINDOW = 5      # context window size
CBOW = 1        # architecture switch: 1 = CBOW, 0 = skip-gram
MIN_COUNT = 1   # passed to gensim as min_count (minimum word frequency to keep)
NB_ITERS = 1    # passed to gensim as iter (number of passes over the corpus)

# Base name shared by the '.info' parameter dump and the '.bin' vectors file.
filename = 'w2v.CBOW={}_WIN={}_DIM={}'.format(CBOW, WINDOW, SIZE)

# Dump every model parameter into a separate text file next to the model.
# Each line has the form "NAME= value" (print inserts the space).
with open(filename + '.info', 'w') as info_file:
    for param_name, param_value in (('corpus_path', corpus_path),
                                    ('SIZE', SIZE),
                                    ('WINDOW', WINDOW),
                                    ('CBOW', CBOW),
                                    ('MIN_COUNT', MIN_COUNT),
                                    ('NB_ITERS', NB_ITERS)):
        print(param_name + '=', param_value, file=info_file)

# Start w2v training.
# LineSentence streams the corpus line by line; word2vec.Text8Corpus(corpus_path)
# is the alternative reader for a corpus that is not split into lines.
sentences = word2vec.LineSentence(corpus_path)

# NOTE: the CBOW flag selects the training architecture, which gensim controls
# through `sg` (1 = skip-gram, otherwise CBOW). It used to be passed as
# `cbow_mean`, which only switches between mean and sum of the context vectors,
# so CBOW=0 silently kept training a CBOW model. `cbow_mean` is left at its
# default (1 = mean), which is what the CBOW=1 configuration used before.
model = word2vec.Word2Vec(sentences,
                          size=SIZE,
                          window=WINDOW,
                          sg=0 if CBOW else 1,
                          min_count=MIN_COUNT,
                          workers=4,
                          sorted_vocab=1,
                          iter=NB_ITERS)

# L2-normalize the vectors in place; replace=True drops the unnormalized copies,
# so the model is meant for lookup only after this point.
model.init_sims(replace=True)

# Save the finished w2v model in the binary word2vec format.
# Use binary=False with a '.txt' name to get the plain-text format instead.
model.wv.save_word2vec_format(filename + '.bin', binary=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment