@ByungSunBae
Created February 14, 2019 09:58
Simple Korean word embedding (modified) with mxnet
# coding: utf-8
# From : https://gist.github.com/haven-jeon/6b508f4547418ab26f6e56b7a831dd9a#file-word2vec-ipynb
# !git clone https://github.com/haven-jeon/KoWordSpacing
# !bunzip2 KoWordSpacing/input.txt.bz2
# # Embedding evaluation data
# !git clone https://github.com/SungjoonPark/KoreanWordVectors
import time
import warnings
import logging
import random
warnings.filterwarnings('ignore')
import mxnet as mx
import gluonnlp as nlp
from mxnet.gluon import nn
import numpy as np
from mxnet import nd, gluon, autograd
import itertools
from konlpy.tag import Mecab
import re
# Run morphological analysis line by line, extract the tokens, and return them as a list.
mecab = Mecab()
sejong_dataset = nlp.data.dataset.CorpusDataset('KoWordSpacing/input.txt',
                                                tokenizer=lambda x: mecab.morphs(x.strip()))
sejong_dataset[0][:10]
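# Quick illustration: mecab.morphs splits a raw sentence into morpheme tokens
# (example sentence chosen arbitrarily; the exact split depends on the installed
# MeCab-ko dictionary).
print(mecab.morphs("아침에 커피를 마셨다"))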
counter = nlp.data.count_tokens(itertools.chain.from_iterable(sejong_dataset))
vocab = nlp.Vocab(counter, unknown_token='<unk>', padding_token=None, bos_token=None, eos_token=None, min_freq=5)
# Words are accessed by index, and a token can be looked up by its index number.
for word in vocab.idx_to_token[:10]:
    print(word)
# The index number for a given token can be printed as well.
print(vocab.token_to_idx["<unk>"], vocab["<unk>"])
print(vocab.token_to_idx["아침"], vocab["아침"])
frequent_token_subsampling = 1e-5
idx_to_counts = np.array([counter[w] for w in vocab.idx_to_token])
f = idx_to_counts / np.sum(idx_to_counts)
idx_to_pdiscard = 1 - np.sqrt(frequent_token_subsampling / f)
coded_dataset = [[vocab[token] for token in sentence
                  if token in vocab
                  and random.uniform(0, 1) > idx_to_pdiscard[vocab[token]]] for sentence in sejong_dataset]
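# Quick check of the subsampling rule with illustrative frequencies (not corpus
# statistics): a token covering 1% of the corpus is discarded roughly 97% of the
# time, while tokens at or below the 1e-5 threshold are always kept.
for f_w in [1e-2, 1e-3, 1e-5]:
    print(f_w, max(0.0, 1 - np.sqrt(frequent_token_subsampling / f_w)))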
batch_size = 2048
batchify = nlp.data.batchify.EmbeddingCenterContextBatchify(batch_size=batch_size, window_size=5, shuffle=True)
context_sampler = batchify(coded_dataset)
negative_weights = nd.array([counter[w] ** 0.75 for w in vocab.idx_to_token])
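# The 0.75 exponent flattens the unigram distribution, so frequent words are
# sampled as negatives somewhat less often than their raw counts would suggest.
# Illustration with toy counts (not corpus statistics):
toy_counts = np.array([100.0, 10.0, 1.0])
print(toy_counts / toy_counts.sum())                    # raw unigram probabilities
print(toy_counts ** 0.75 / (toy_counts ** 0.75).sum())  # smoothed sampling probabilities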
ctx = mx.gpu()
class embedding_model(nn.Block):
    def __init__(self, input_dim, output_dim, neg_weight, num_neg=5, batch_size=2048):
        super(embedding_model, self).__init__()
        self.num_neg = num_neg
        self.negatives_sampler = nlp.data.UnigramCandidateSampler(weights=neg_weight, shape=(batch_size, 1))
        with self.name_scope():
            # center word embedding
            self.w = nn.Embedding(input_dim, output_dim)
            # context word embedding
            self.w_ = nn.Embedding(input_dim, output_dim)

    def forward(self, center, context, context_mask):
        # Running inside the input's context means nd.array calls below need no
        # explicit device; this is essential for multi-GPU training.
        with center.context:
            # Draw self.num_neg negative (non-context) words per context word.
            # negs = self.negatives_sampler(context)
            # negs = nd.concat(*[self.negatives_sampler(context).reshape(-1,1) for _ in range(0, self.num_neg)], dim=1)
            ##### modified part 1 #####
            negs = nd.concat(*[self.negatives_sampler(context) for _ in range(0, self.num_neg)], dim=1)
            ###########################
            negs = negs.as_in_context(center.context)
            context_negs = nd.concat(context, negs, dim=1)
            embed_c = self.w(center)
            # (n_batch, context_length, embedding_vector)
            embed_u = self.w_(context_negs)
            # Tile the context mask (1 + self.num_neg) times to mark the positions that
            # hold real values, so only the given context words and their negatives are trained.
            context_neg_mask = context_mask.tile((1, 1 + self.num_neg))
            # (n_batch, 1, embedding_vector) x (n_batch, embedding_vector, context_length)
            # -> (n_batch, 1, context_length)
            pred = nd.batch_dot(embed_c, embed_u.transpose((0, 2, 1)))
            pred = pred.squeeze() * context_neg_mask
            # Negative samples all get label 0.
            label = nd.concat(context_mask, nd.zeros_like(negs), dim=1)
        return pred, label
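# Shape walk-through of the forward pass with toy tensors (illustrative only:
# batch of 4, one context word per center word, num_neg = 2, embedding size 8).
_c = nd.ones((4, 1, 8))   # embed_c : (n_batch, 1, embedding_vector)
_u = nd.ones((4, 3, 8))   # embed_u : (n_batch, 1 + num_neg, embedding_vector)
print(nd.batch_dot(_c, _u.transpose((0, 2, 1))).squeeze().shape)  # (4, 3): one score per context/negative word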
# In[19]:
import pandas as pd
wv_golden = pd.read_csv('KoreanWordVectors/WS353_korean.csv')
word1 = wv_golden['word 1']
word2 = wv_golden['word 2']
score = wv_golden['kor_score']
res = [[vocab.token_to_idx[i], vocab.token_to_idx[j], k] for i, j, k in zip(word1, word2, score)
       if vocab.token_to_idx[i] != 0 and vocab.token_to_idx[j] != 0]
word12score = nd.array(res, ctx=ctx)
word1, word2, scores = (word12score[:,0], word12score[:,1], word12score[:,2])
def pearson_correlation(w2v, word1, word2, scores):
    # Note: despite the name, this returns the Spearman rank correlation between
    # the model's cosine similarities and the human judgement scores.
    from scipy import stats
    evaluator = nlp.embedding.evaluation.WordEmbeddingSimilarity(
        idx_to_vec=w2v,
        similarity_function="CosineSimilarity")
    evaluator.initialize(ctx=ctx)
    evaluator.hybridize()
    pred = evaluator(word1, word2)
    scorr = stats.spearmanr(pred.asnumpy(), scores.asnumpy())
    return scorr
# In[20]:
wv_golden.head(10)
# In[ ]:
from tqdm import tqdm
ctx = mx.gpu()
num_negs = 15
vocab_size = len(vocab.idx_to_token)
vec_size = 200
embed = embedding_model(vocab_size, vec_size, negative_weights, num_negs, batch_size)
embed.initialize(mx.init.Xavier(), ctx=ctx)
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
optimizer = gluon.Trainer(embed.collect_params(), 'adam', {'learning_rate':0.001})
avg_loss = []
corrs = []
interval = 50
epoch = 100
for e in range(epoch):
    for i, batch in enumerate(tqdm(context_sampler)):
        # center, tuples = [nd.array(d).as_in_context(ctx) for d in ]
        ##### modified part 2 #####
        center, tuples = batch
        context_mask, _, context = tuples
        center = nd.array(center.reshape(-1, 1), ctx=ctx)
        context = nd.array(context.reshape(-1, 1), ctx=ctx)
        context_mask = nd.array(context_mask.reshape(-1, 1), ctx=ctx)
        ###########################
        with autograd.record():
            pred, label = embed(center, context, context_mask)
            loss_val = loss(pred, label)
        loss_val.backward()
        optimizer.step(center.shape[0])
        avg_loss.append(loss_val.mean().asscalar())
    corr = pearson_correlation(embed.w.weight.data(), word1, word2, scores)
    corrs.append(corr.correlation)
    if e % 10 == 0 or e == epoch - 1:
        print("{} epoch, loss {}, corr {}".format(e + 1, loss_val.mean().asscalar(), corr.correlation))
center[20,], context[20,], context_mask[20,]
corrs_pd = pd.DataFrame({'epoch':list(range(1, 101)), 'corr':corrs})
corrs_pd.iloc[corrs_pd["corr"].argmax(),:]
#get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
corrs_pd.plot(x='epoch', y='corr', title='Spearman Rank Correlation')
plt.savefig('spcorr.png', dpi=300)