Simple Korean word embedding (modified) with mxnet
# coding: utf-8
# From: https://gist.github.com/haven-jeon/6b508f4547418ab26f6e56b7a831dd9a#file-word2vec-ipynb
# !git clone https://github.com/haven-jeon/KoWordSpacing
# !bunzip2 KoWordSpacing/input.txt.bz2
# # Embedding evaluation data
# !git clone https://github.com/SungjoonPark/KoreanWordVectors
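# (Note) KoWordSpacing/input.txt is the training text (the variable below calls it
# sejong_dataset, presumably Sejong-corpus sentences), and KoreanWordVectors provides the
# WS353_korean.csv word-similarity pairs used for evaluation further down.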
import time
import warnings
import logging
import random
warnings.filterwarnings('ignore')
import mxnet as mx
import gluonnlp as nlp
from mxnet.gluon import nn
import numpy as np
from mxnet import nd, gluon, autograd
import itertools
from konlpy.tag import Mecab
import re
# Tokenize each line with the Mecab morphological analyzer and return the tokens as a list
mecab = Mecab()
sejong_dataset = nlp.data.dataset.CorpusDataset('KoWordSpacing/input.txt',
                                                tokenizer=lambda x: mecab.morphs(x.strip()))
sejong_dataset[0][:10]
counter = nlp.data.count_tokens(itertools.chain.from_iterable(sejong_dataset))
vocab = nlp.Vocab(counter, unknown_token='<unk>', padding_token=None, bos_token=None, eos_token=None, min_freq=5)
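# (Note) With min_freq=5, tokens seen fewer than 5 times get no index of their own and
# fall back to '<unk>'; since padding/bos/eos tokens are disabled, index 0 is '<unk>'.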
# Words are stored by index; a token can be looked up from its index number.
for word in vocab.idx_to_token[:10]:
    print(word)
# The index number can also be printed from a token.
print(vocab.token_to_idx["<unk>"], vocab["<unk>"])
print(vocab.token_to_idx["아침"], vocab["아침"])
frequent_token_subsampling = 1e-5
idx_to_counts = np.array([counter[w] for w in vocab.idx_to_token])
f = idx_to_counts / np.sum(idx_to_counts)
idx_to_pdiscard = 1 - np.sqrt(frequent_token_subsampling / f)
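# (Added note) This is the word2vec subsampling heuristic: a token with relative
# frequency f is discarded with probability 1 - sqrt(t / f), t = 1e-5. For example,
# a token making up 1% of the corpus gets 1 - sqrt(1e-5 / 0.01) ≈ 0.97, so it is
# dropped about 97% of the time, while tokens at or below the threshold are always kept.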
coded_dataset = [[vocab[token] for token in sentence
                  if token in vocab
                  and random.uniform(0, 1) > idx_to_pdiscard[vocab[token]]] for sentence in sejong_dataset]
batch_size = 2048
batchify = nlp.data.batchify.EmbeddingCenterContextBatchify(batch_size=batch_size, window_size=5, shuffle=True)
context_sampler = batchify(coded_dataset)
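# (Note) context_sampler yields (center, tuple) batches; the training loop below
# unpacks the tuple into a context mask, an element it ignores, and the context word ids.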
negative_weights = nd.array([counter[w] ** 0.75 for w in vocab.idx_to_token])
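# (Added note) Raising counts to the 0.75 power gives the smoothed unigram distribution
# used for negative sampling in the original word2vec paper; it samples rare words as
# negatives somewhat more often than their raw frequency alone would.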
ctx = mx.gpu()

class embedding_model(nn.Block):
    def __init__(self, input_dim, output_dim, neg_weight, num_neg=5, batch_size=2048):
        super(embedding_model, self).__init__()
        self.num_neg = num_neg
        self.negatives_sampler = nlp.data.UnigramCandidateSampler(weights=neg_weight, shape=(batch_size, 1))
        with self.name_scope():
            # center word embedding
            self.w = nn.Embedding(input_dim, output_dim)
            # context word embedding
            self.w_ = nn.Embedding(input_dim, output_dim)

    def forward(self, center, context, context_mask):
        # Working inside center.context means nd.array does not need an explicit device;
        # this is essential for multi-GPU training.
        with center.context:
            # Draw self.num_neg negative (non-context) words per context word.
            # negs = self.negatives_sampler(context)
            # negs = nd.concat(*[self.negatives_sampler(context).reshape(-1,1) for _ in range(0, self.num_neg)], dim=1)
            ##### Modified part 1 #####
            negs = nd.concat(*[self.negatives_sampler(context) for _ in range(0, self.num_neg)], dim=1)
            ###########################
            negs = negs.as_in_context(center.context)
            context_negs = nd.concat(context, negs, dim=1)
            embed_c = self.w(center)
            # (n_batch, context_length, embedding_vector)
            embed_u = self.w_(context_negs)
            # Replicate the context mask so it also covers the negatives; in effect only
            # (number of real context words) * (1 + self.num_neg) positions are trained.
            context_neg_mask = context_mask.tile((1, 1 + self.num_neg))
            # (n_batch, 1, embedding_vector) * (n_batch, embedding_vector, context_length)
            # -> (n_batch, 1, context_length)
            pred = nd.batch_dot(embed_c, embed_u.transpose((0, 2, 1)))
            pred = pred.squeeze() * context_neg_mask
            # Negative samples all get label 0.
            label = nd.concat(context_mask, nd.zeros_like(negs), dim=1)
            return pred, label
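# (Added sketch) Shape walkthrough, assuming the (N, 1)-shaped center, context and
# context_mask arrays produced in the training loop below:
#   negs         : (N, num_neg)       sampled negative word ids
#   context_negs : (N, 1 + num_neg)   true context word followed by its negatives
#   embed_c      : (N, 1, vec_size)
#   embed_u      : (N, 1 + num_neg, vec_size)
#   pred         : (N, 1 + num_neg)   masked dot products fed to the sigmoid BCE loss
#   label        : (N, 1 + num_neg)   1 for the real context word, 0 for negatives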
# In[19]:
import pandas as pd
wv_golden = pd.read_csv('KoreanWordVectors/WS353_korean.csv')
word1 = wv_golden['word 1']
word2 = wv_golden['word 2']
score = wv_golden['kor_score']
res = [[vocab.token_to_idx[i], vocab.token_to_idx[j], k] for i, j, k in zip(word1, word2, score)
       if vocab.token_to_idx[i] != 0 and vocab.token_to_idx[j] != 0]
word12score = nd.array(res, ctx=ctx)
word1, word2, scores = (word12score[:, 0], word12score[:, 1], word12score[:, 2])
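# (Note) Index 0 is '<unk>', so the comprehension above drops every WS353 pair in which
# either word is out of vocabulary before building the evaluation arrays.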
def pearson_correlation(w2v, word1, word2, scores):
    from scipy import stats
    evaluator = nlp.embedding.evaluation.WordEmbeddingSimilarity(
        idx_to_vec=w2v,
        similarity_function="CosineSimilarity")
    evaluator.initialize(ctx=ctx)
    evaluator.hybridize()
    pred = evaluator(word1, word2)
    scorr = stats.spearmanr(pred.asnumpy(), scores.asnumpy())
    return scorr
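# (Added note) Despite its name, this helper returns the Spearman rank correlation
# (scipy.stats.spearmanr) between the model's cosine similarities and the human WS353
# scores, which is the metric plotted at the end of the script.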
# In[20]:
wv_golden.head(10)
# In[ ]:
from tqdm import tqdm
ctx = mx.gpu()
num_negs = 15
vocab_size = len(vocab.idx_to_token)
vec_size = 200
embed = embedding_model(vocab_size, vec_size, negative_weights, num_negs, batch_size)
embed.initialize(mx.init.Xavier(), ctx=ctx)
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
optimizer = gluon.Trainer(embed.collect_params(), 'adam', {'learning_rate': 0.001})
avg_loss = []
corrs = []
interval = 50
epoch = 100
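# (Added note) SigmoidBinaryCrossEntropyLoss on the masked dot products is the
# skip-gram negative-sampling objective: the real context word is pushed toward
# label 1 and the num_negs sampled words toward label 0.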
for e in range(epoch):
    for i, batch in enumerate(tqdm(context_sampler)):
        # center, tuples = [nd.array(d).as_in_context(ctx) for d in ]
        ##### Modified part 2 #####
        center, tuples = batch
        context_mask, _, context = tuples
        center = nd.array(center.reshape(-1, 1), ctx=ctx)
        context = nd.array(context.reshape(-1, 1), ctx=ctx)
        context_mask = nd.array(context_mask.reshape(-1, 1), ctx=ctx)
        ###########################
        with autograd.record():
            pred, label = embed(center, context, context_mask)
            loss_val = loss(pred, label)
        loss_val.backward()
        optimizer.step(center.shape[0])
        avg_loss.append(loss_val.mean().asscalar())
    corr = pearson_correlation(embed.w.weight.data(), word1, word2, scores)
    corrs.append(corr.correlation)
    if e % 10 == 0 or e == epoch - 1:
        print("{} epoch, loss {}, corr {}".format(e + 1, loss_val.mean().asscalar(), corr.correlation))

center[20,], context[20,], context_mask[20,]
corrs_pd = pd.DataFrame({'epoch': list(range(1, epoch + 1)), 'corr': corrs})
corrs_pd.iloc[corrs_pd["corr"].argmax(), :]
# get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
corrs_pd.plot(x='epoch', y='corr', title='Spearman Rank Correlation')
plt.savefig('spcorr.png', dpi=300)