Simple Korean word embedding (modified) with mxnet
# coding: utf-8
# From: https://gist.github.com/haven-jeon/6b508f4547418ab26f6e56b7a831dd9a#file-word2vec-ipynb
# !git clone https://github.com/haven-jeon/KoWordSpacing
# !bunzip2 KoWordSpacing/input.txt.bz2
# # Embedding evaluation data
# !git clone https://github.com/SungjoonPark/KoreanWordVectors
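# (Note) KoWordSpacing/input.txt is the training text (the variable below calls it
# sejong_dataset, presumably Sejong-corpus sentences), and KoreanWordVectors provides the
# WS353_korean.csv word-similarity pairs used for evaluation further down.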
import time
import warnings
import logging
import random
warnings.filterwarnings('ignore')
import mxnet as mx
import gluonnlp as nlp
from mxnet.gluon import nn
import numpy as np
from mxnet import nd, gluon, autograd
import itertools
from konlpy.tag import Mecab
import re
# Tokenize each line with the Mecab morphological analyzer and return the tokens as a list
mecab = Mecab()
sejong_dataset = nlp.data.dataset.CorpusDataset('KoWordSpacing/input.txt',
                                                tokenizer=lambda x: mecab.morphs(x.strip()))
sejong_dataset[0][:10]
counter = nlp.data.count_tokens(itertools.chain.from_iterable(sejong_dataset))
vocab = nlp.Vocab(counter, unknown_token='<unk>', padding_token=None, bos_token=None, eos_token=None, min_freq=5)
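# (Note) With min_freq=5, tokens seen fewer than 5 times get no index of their own and
# fall back to '<unk>'; since padding/bos/eos tokens are disabled, index 0 is '<unk>'.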
# Words are stored by index; a token can be looked up from its index number.
for word in vocab.idx_to_token[:10]:
    print(word)
# The index number can also be printed from a token.
print(vocab.token_to_idx["<unk>"], vocab["<unk>"])
print(vocab.token_to_idx["아침"], vocab["아침"])
frequent_token_subsampling = 1e-5
idx_to_counts = np.array([counter[w] for w in vocab.idx_to_token])
f = idx_to_counts / np.sum(idx_to_counts)
idx_to_pdiscard = 1 - np.sqrt(frequent_token_subsampling / f)
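# (Added note) This is the word2vec subsampling heuristic: a token with relative
# frequency f is discarded with probability 1 - sqrt(t / f), t = 1e-5. For example,
# a token making up 1% of the corpus gets 1 - sqrt(1e-5 / 0.01) ≈ 0.97, so it is
# dropped about 97% of the time, while tokens at or below the threshold are always kept.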
coded_dataset = [[vocab[token] for token in sentence
                  if token in vocab
                  and random.uniform(0, 1) > idx_to_pdiscard[vocab[token]]] for sentence in sejong_dataset]
batch_size = 2048
batchify = nlp.data.batchify.EmbeddingCenterContextBatchify(batch_size=batch_size, window_size=5, shuffle=True)
context_sampler = batchify(coded_dataset)
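# (Note) context_sampler yields (center, tuple) batches; the training loop below
# unpacks the tuple into a context mask, an element it ignores, and the context word ids.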
negative_weights = nd.array([counter[w] ** 0.75 for w in vocab.idx_to_token])
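# (Added note) Raising counts to the 0.75 power gives the smoothed unigram distribution
# used for negative sampling in the original word2vec paper; it samples rare words as
# negatives somewhat more often than their raw frequency alone would.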
ctx = mx.gpu()

class embedding_model(nn.Block):
    def __init__(self, input_dim, output_dim, neg_weight, num_neg=5, batch_size=2048):
        super(embedding_model, self).__init__()
        self.num_neg = num_neg
        self.negatives_sampler = nlp.data.UnigramCandidateSampler(weights=neg_weight, shape=(batch_size, 1))
        with self.name_scope():
            # center word embedding
            self.w = nn.Embedding(input_dim, output_dim)
            # context word embedding
            self.w_ = nn.Embedding(input_dim, output_dim)

    def forward(self, center, context, context_mask):
        # Working inside center.context means nd.array does not need an explicit device;
        # this is essential for multi-GPU training.
        with center.context:
            # Draw self.num_neg negative (non-context) words per context word.
            # negs = self.negatives_sampler(context)
            # negs = nd.concat(*[self.negatives_sampler(context).reshape(-1,1) for _ in range(0, self.num_neg)], dim=1)
            ##### Modified part 1 #####
            negs = nd.concat(*[self.negatives_sampler(context) for _ in range(0, self.num_neg)], dim=1)
            ###########################
            negs = negs.as_in_context(center.context)
            context_negs = nd.concat(context, negs, dim=1)
            embed_c = self.w(center)
            # (n_batch, context_length, embedding_vector)
            embed_u = self.w_(context_negs)
            # Replicate the context mask so it also covers the negatives; in effect only
            # (number of real context words) * (1 + self.num_neg) positions are trained.
            context_neg_mask = context_mask.tile((1, 1 + self.num_neg))
            # (n_batch, 1, embedding_vector) * (n_batch, embedding_vector, context_length)
            # -> (n_batch, 1, context_length)
            pred = nd.batch_dot(embed_c, embed_u.transpose((0, 2, 1)))
            pred = pred.squeeze() * context_neg_mask
            # Negative samples all get label 0.
            label = nd.concat(context_mask, nd.zeros_like(negs), dim=1)
            return pred, label
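# (Added sketch) Shape walkthrough, assuming the (N, 1)-shaped center, context and
# context_mask arrays produced in the training loop below:
#   negs         : (N, num_neg)       sampled negative word ids
#   context_negs : (N, 1 + num_neg)   true context word followed by its negatives
#   embed_c      : (N, 1, vec_size)
#   embed_u      : (N, 1 + num_neg, vec_size)
#   pred         : (N, 1 + num_neg)   masked dot products fed to the sigmoid BCE loss
#   label        : (N, 1 + num_neg)   1 for the real context word, 0 for negatives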
# In[19]:
import pandas as pd
wv_golden = pd.read_csv('KoreanWordVectors/WS353_korean.csv')
word1 = wv_golden['word 1']
word2 = wv_golden['word 2']
score = wv_golden['kor_score']
res = [[vocab.token_to_idx[i], vocab.token_to_idx[j], k] for i, j, k in zip(word1, word2, score)
       if vocab.token_to_idx[i] != 0 and vocab.token_to_idx[j] != 0]
word12score = nd.array(res, ctx=ctx)
word1, word2, scores = (word12score[:, 0], word12score[:, 1], word12score[:, 2])
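# (Note) Index 0 is '<unk>', so the comprehension above drops every WS353 pair in which
# either word is out of vocabulary before building the evaluation arrays.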
def pearson_correlation(w2v, word1, word2, scores):
    from scipy import stats
    evaluator = nlp.embedding.evaluation.WordEmbeddingSimilarity(
        idx_to_vec=w2v,
        similarity_function="CosineSimilarity")
    evaluator.initialize(ctx=ctx)
    evaluator.hybridize()
    pred = evaluator(word1, word2)
    scorr = stats.spearmanr(pred.asnumpy(), scores.asnumpy())
    return scorr
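# (Added note) Despite its name, this helper returns the Spearman rank correlation
# (scipy.stats.spearmanr) between the model's cosine similarities and the human WS353
# scores, which is the metric plotted at the end of the script.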
# In[20]:
wv_golden.head(10)
# In[ ]:
from tqdm import tqdm
ctx = mx.gpu()
num_negs = 15
vocab_size = len(vocab.idx_to_token)
vec_size = 200
embed = embedding_model(vocab_size, vec_size, negative_weights, num_negs, batch_size)
embed.initialize(mx.init.Xavier(), ctx=ctx)
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
optimizer = gluon.Trainer(embed.collect_params(), 'adam', {'learning_rate': 0.001})
avg_loss = []
corrs = []
interval = 50
epoch = 100
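# (Added note) SigmoidBinaryCrossEntropyLoss on the masked dot products is the
# skip-gram negative-sampling objective: the real context word is pushed toward
# label 1 and the num_negs sampled words toward label 0.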
for e in range(epoch):
    for i, batch in enumerate(tqdm(context_sampler)):
        # center, tuples = [nd.array(d).as_in_context(ctx) for d in ]
        ##### Modified part 2 #####
        center, tuples = batch
        context_mask, _, context = tuples
        center = nd.array(center.reshape(-1, 1), ctx=ctx)
        context = nd.array(context.reshape(-1, 1), ctx=ctx)
        context_mask = nd.array(context_mask.reshape(-1, 1), ctx=ctx)
        ###########################
        with autograd.record():
            pred, label = embed(center, context, context_mask)
            loss_val = loss(pred, label)
        loss_val.backward()
        optimizer.step(center.shape[0])
        avg_loss.append(loss_val.mean().asscalar())
    corr = pearson_correlation(embed.w.weight.data(), word1, word2, scores)
    corrs.append(corr.correlation)
    if e % 10 == 0 or e == epoch - 1:
        print("{} epoch, loss {}, corr {}".format(e + 1, loss_val.mean().asscalar(), corr.correlation))

center[20,], context[20,], context_mask[20,]
corrs_pd = pd.DataFrame({'epoch': list(range(1, epoch + 1)), 'corr': corrs})
corrs_pd.iloc[corrs_pd["corr"].argmax(), :]
# get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
corrs_pd.plot(x='epoch', y='corr', title='Spearman Rank Correlation')
plt.savefig('spcorr.png', dpi=300)