Last active
December 25, 2020 17:56
-
-
Save hiropppe/cf874500822e273101b162e373bcba13 to your computer and use it in GitHub Desktop.
Gensim で作成した小さめの wv からコサイン類似度行列の作り方
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [
 "# Display every top-level expression result in a cell, not only the last one.\n",
 "from IPython.core.interactiveshell import InteractiveShell\n",
 "\n",
 "InteractiveShell.ast_node_interactivity = \"all\""
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [
 "import MeCab\n",
 "\n",
 "# Node format: each morpheme is rendered as ' <POS>[SEP]<base form>';\n",
 "# -U handles unknown words, -E '\\n' ends each sentence, -b raises the input buffer size.\n",
 "mecab = MeCab.Tagger('-F\\\\s%f[0][SEP]%f[6] -U\\\\s%m -E\\\\n -b 100000 -d /usr/local/lib/mecab/dic/ipadic')\n",
 "\n",
 "with open('stop_ja.txt') as fsw:\n",
 "    stop_words = frozenset(fsw.read().splitlines())\n",
 "\n",
 "SEP = '[SEP]'\n",
 "# Keep only content words: nouns, verbs, adjectives, adverbs.\n",
 "CONTENT_POS = ('名詞' + SEP, '動詞' + SEP, '形容詞' + SEP, '副詞' + SEP)\n",
 "\n",
 "lower = True\n",
 "with open('/data/corpus.txt') as reader, open('./corpus.tok', 'w') as writer:\n",
 "    for line in reader:\n",
 "        tokens = []\n",
 "        for pos_word in mecab.parse(line).strip().split(' '):\n",
 "            # Generator (not a list) inside any(): short-circuits without building a list.\n",
 "            if any(pos in pos_word for pos in CONTENT_POS):\n",
 "                token = pos_word[pos_word.find(SEP) + len(SEP):]\n",
 "                tokens.append(token)\n",
 "\n",
 "        # NOTE(review): the stop-word test runs on the ORIGINAL casing, before\n",
 "        # lower() is applied — confirm stop_ja.txt entries match that casing.\n",
 "        tokens = [token.lower() if lower else token for token in tokens if token not in stop_words]\n",
 "\n",
 "        if tokens:\n",
 "            print(' '.join(tokens), file=writer)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2020-11-10 02:21:01,266 : INFO : collecting all words and their counts\n", | |
"2020-11-10 02:21:01,268 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", | |
"2020-11-10 02:21:01,661 : INFO : collected 23740 word types from a corpus of 780333 raw words and 7155 sentences\n", | |
"2020-11-10 02:21:01,662 : INFO : Loading a fresh vocabulary\n", | |
"2020-11-10 02:21:01,708 : INFO : effective_min_count=5 retains 10710 unique words (45% of original 23740, drops 13030)\n", | |
"2020-11-10 02:21:01,709 : INFO : effective_min_count=5 leaves 756097 word corpus (96% of original 780333, drops 24236)\n", | |
"2020-11-10 02:21:01,776 : INFO : deleting the raw counts dictionary of 23740 items\n", | |
"2020-11-10 02:21:01,779 : INFO : sample=0.001 downsamples 30 most-common words\n", | |
"2020-11-10 02:21:01,780 : INFO : downsampling leaves estimated 680211 word corpus (90.0% of prior 756097)\n", | |
"2020-11-10 02:21:01,827 : INFO : estimated required memory for 10710 words and 100 dimensions: 13923000 bytes\n", | |
"2020-11-10 02:21:01,829 : INFO : resetting layer weights\n", | |
"2020-11-10 02:21:08,580 : INFO : training model with 2 workers on 10710 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=20 window=5\n", | |
"2020-11-10 02:21:09,612 : INFO : EPOCH 1 - PROGRESS: at 39.71% examples, 192691 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:10,614 : INFO : EPOCH 1 - PROGRESS: at 63.76% examples, 189752 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:11,650 : INFO : EPOCH 1 - PROGRESS: at 92.38% examples, 199473 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:11,917 : INFO : worker thread finished; awaiting finish of 1 more threads\n", | |
"2020-11-10 02:21:11,960 : INFO : worker thread finished; awaiting finish of 0 more threads\n", | |
"2020-11-10 02:21:11,963 : INFO : EPOCH - 1 : training on 780333 raw words (680257 effective words) took 3.4s, 201393 effective words/s\n", | |
"2020-11-10 02:21:12,978 : INFO : EPOCH 2 - PROGRESS: at 38.52% examples, 187929 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:14,015 : INFO : EPOCH 2 - PROGRESS: at 67.55% examples, 205395 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:15,034 : INFO : EPOCH 2 - PROGRESS: at 97.57% examples, 213650 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:15,118 : INFO : worker thread finished; awaiting finish of 1 more threads\n", | |
"2020-11-10 02:21:15,180 : INFO : worker thread finished; awaiting finish of 0 more threads\n", | |
"2020-11-10 02:21:15,182 : INFO : EPOCH - 2 : training on 780333 raw words (680427 effective words) took 3.2s, 211707 effective words/s\n", | |
"2020-11-10 02:21:16,204 : INFO : EPOCH 3 - PROGRESS: at 41.45% examples, 204081 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:17,215 : INFO : EPOCH 3 - PROGRESS: at 65.72% examples, 199128 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:18,227 : INFO : EPOCH 3 - PROGRESS: at 91.21% examples, 198536 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:18,613 : INFO : worker thread finished; awaiting finish of 1 more threads\n", | |
"2020-11-10 02:21:18,672 : INFO : worker thread finished; awaiting finish of 0 more threads\n", | |
"2020-11-10 02:21:18,674 : INFO : EPOCH - 3 : training on 780333 raw words (680134 effective words) took 3.5s, 195380 effective words/s\n", | |
"2020-11-10 02:21:19,687 : INFO : EPOCH 4 - PROGRESS: at 41.45% examples, 205383 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:20,690 : INFO : EPOCH 4 - PROGRESS: at 71.13% examples, 222147 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:21,624 : INFO : worker thread finished; awaiting finish of 1 more threads\n", | |
"2020-11-10 02:21:21,655 : INFO : worker thread finished; awaiting finish of 0 more threads\n", | |
"2020-11-10 02:21:21,656 : INFO : EPOCH - 4 : training on 780333 raw words (680112 effective words) took 3.0s, 228708 effective words/s\n", | |
"2020-11-10 02:21:22,666 : INFO : EPOCH 5 - PROGRESS: at 39.71% examples, 196997 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:23,743 : INFO : EPOCH 5 - PROGRESS: at 61.34% examples, 176872 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:24,759 : INFO : EPOCH 5 - PROGRESS: at 88.41% examples, 188818 words/s, in_qsize 3, out_qsize 0\n", | |
"2020-11-10 02:21:25,134 : INFO : worker thread finished; awaiting finish of 1 more threads\n", | |
"2020-11-10 02:21:25,179 : INFO : worker thread finished; awaiting finish of 0 more threads\n", | |
"2020-11-10 02:21:25,180 : INFO : EPOCH - 5 : training on 780333 raw words (680387 effective words) took 3.5s, 193328 effective words/s\n", | |
"2020-11-10 02:21:25,181 : INFO : training on a 3901665 raw words (3401317 effective words) took 16.6s, 204901 effective words/s\n" | |
] | |
} | |
], | |
"source": [
 "import logging\n",
 "\n",
 "from gensim.models import Word2Vec, KeyedVectors\n",
 "from gensim.models.word2vec import LineSentence\n",
 "\n",
 "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
 "\n",
 "# Stream the tokenized corpus, one whitespace-separated sentence per line.\n",
 "lsc = LineSentence('./corpus.tok')\n",
 "\n",
 "# CBOW (sg=0) with negative sampling (hs=0, negative=20), 100-dim vectors, 2 workers.\n",
 "sg = 0\n",
 "size = 100\n",
 "hs = 0\n",
 "negative = 20\n",
 "workers = 2\n",
 "\n",
 "model = Word2Vec(lsc, sg=sg, size=size, hs=hs, negative=negative, workers=workers)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [
 "# Snapshot the raw (un-normalized) embedding matrix before init_sims\n",
 "# overwrites it in place below. ndarray.copy() is the direct, idiomatic\n",
 "# way to deep-copy the buffer (copy.copy delegated to it via __copy__).\n",
 "original_vector = model.wv.vectors.copy()"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2020-11-10 02:21:25,230 : INFO : precomputing L2-norms of word weight vectors\n" | |
] | |
} | |
], | |
"source": [
 "# L2-normalize the word vectors in place. replace=True discards the raw\n",
 "# weights, so the snapshot taken above is the only copy of the originals.\n",
 "model.init_sims(replace=True)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[-0.01522974, -0.12238098, -0.12815583, ..., -0.12898709,\n", | |
" 0.01830324, -0.13048793],\n", | |
" [ 0.11313245, -0.00234493, -0.1835047 , ..., 0.046124 ,\n", | |
" -0.03200746, -0.09186589],\n", | |
" [-0.01789492, -0.06258424, 0.12082538, ..., 0.07070544,\n", | |
" -0.01403479, -0.07681925],\n", | |
" ...,\n", | |
" [ 0.00476433, -0.16728209, -0.06598336, ..., 0.1464505 ,\n", | |
" 0.02849043, -0.05748696],\n", | |
" [-0.02408532, -0.12983347, -0.08509277, ..., 0.13052596,\n", | |
" 0.04662877, -0.09346067],\n", | |
" [ 0.02076837, -0.13136543, -0.01089873, ..., 0.1281885 ,\n", | |
" 0.06908399, -0.0808148 ]], dtype=float32)" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
 "# After init_sims(replace=True), every row of wv.vectors has unit L2 norm.\n",
 "vectors1 = model.wv.vectors\n",
 "vectors1"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[0.9999999 , 0.38079122, 0.32834882, ..., 0.3337108 , 0.2777639 ,\n", | |
" 0.3528665 ],\n", | |
" [0.38079122, 1. , 0.25065055, ..., 0.36218068, 0.27640563,\n", | |
" 0.38131532],\n", | |
" [0.32834882, 0.25065055, 1.0000004 , ..., 0.28533465, 0.32913357,\n", | |
" 0.30477363],\n", | |
" ...,\n", | |
" [0.3337108 , 0.36218068, 0.28533465, ..., 0.99999976, 0.9267827 ,\n", | |
" 0.9388081 ],\n", | |
" [0.2777639 , 0.27640563, 0.32913357, ..., 0.9267827 , 0.99999976,\n", | |
" 0.86002296],\n", | |
" [0.3528665 , 0.38131532, 0.30477363, ..., 0.9388081 , 0.86002296,\n", | |
" 1. ]], dtype=float32)" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
 "# With unit-norm rows, the Gram matrix X @ X.T IS the pairwise cosine-similarity\n",
 "# matrix: symmetric, with a diagonal of ~1 (up to float32 rounding, as seen below).\n",
 "similarity_matrix1 = vectors1 @ vectors1.T\n",
 "similarity_matrix1"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0, 129, 141, ..., 221, 382, 512],\n", | |
" [ 1, 2890, 395, ..., 977, 596, 374],\n", | |
" [ 2, 266, 194, ..., 163, 28, 12],\n", | |
" ...,\n", | |
" [10707, 8719, 9374, ..., 27, 13, 9],\n", | |
" [10708, 3829, 4879, ..., 27, 12, 94],\n", | |
" [10709, 4474, 4658, ..., 76, 41, 96]])" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
 "import numpy as np\n",
 "\n",
 "# argsort sorts ascending, so reverse each row to rank neighbours from most\n",
 "# to least similar; column 0 is then the word itself (self-similarity ~1).\n",
 "similarity_rank1 = np.argsort(similarity_matrix1)[:, ::-1]\n",
 "similarity_rank1"
]
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"gensim のコサイン類似度の結果と比較" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [5991, 8060, 10131, 8552, 9070, 7986, 6843, 5736, 7888, 5049],\n", | |
" [5991, 8060, 10131, 8552, 9070, 7986, 6843, 5736, 7888, 5049])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [9212, 6410, 5736, 7400, 9307, 8656, 8181, 10268, 10227, 3506],\n", | |
" [9212, 6410, 5736, 7400, 9307, 8656, 8181, 10268, 10227, 3506])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [5704, 8609, 10034, 10453, 9366, 10159, 8622, 10272, 10263, 7514],\n", | |
" [5704, 8609, 10034, 10453, 9366, 10159, 8622, 10272, 10263, 7514])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [8170, 9874, 5672, 5355, 5262, 3545, 9049, 9142, 5156, 7611],\n", | |
" [8170, 9874, 5672, 5355, 5262, 3545, 9049, 9142, 5156, 7611])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [9347, 9011, 7116, 5349, 10089, 9051, 5597, 7158, 7878, 6094],\n", | |
" [9347, 9011, 7116, 5349, 10089, 9051, 5597, 7158, 7878, 6094])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [5434, 2858, 9925, 4479, 6871, 4622, 7021, 5588, 6336, 5233],\n", | |
" [5434, 2858, 9925, 4479, 6871, 4622, 7021, 5588, 6336, 5233])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [2695, 2224, 3064, 3755, 3093, 3674, 3877, 3062, 5240, 4478],\n", | |
" [2695, 2224, 3064, 3755, 3093, 3674, 3877, 3062, 5240, 4478])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [3942, 3681, 3941, 4529, 3086, 7263, 4101, 3514, 3046, 9907],\n", | |
" [3942, 3681, 3941, 4529, 3086, 7263, 4101, 3514, 3046, 9907])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [9231, 7773, 7851, 8644, 7772, 10682, 8064, 7988, 3911, 9083],\n", | |
" [9231, 7773, 7851, 8644, 7772, 10682, 8064, 7988, 3911, 9083])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"(True,\n", | |
" [8618, 9078, 6240, 6296, 8963, 7151, 10135, 8629, 10191, 7439],\n", | |
" [8618, 9078, 6240, 6296, 8963, 7151, 10135, 8629, 10191, 7439])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
 "# Spot-check 10 random words: our matrix-based ranking must match gensim's\n",
 "# similar_by_word. Seed first so the sample is reproducible on re-run.\n",
 "np.random.seed(42)\n",
 "\n",
 "for _ in range(10):\n",
 "    sample_word_index = np.random.randint(len(model.wv.vocab))\n",
 "\n",
 "    # Skip column 0 (the word itself); take the next 10 neighbours.\n",
 "    a = similarity_rank1[sample_word_index, 1:11].tolist()\n",
 "    b = [model.wv.vocab[s[0]].index for s in model.wv.similar_by_word(model.wv.index2word[sample_word_index], topn=10)]\n",
 "    a == b, a, b"
]
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"正規化前ベクトルを自分で L2 正規化したものと同じであることを確認" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[-0.01522974, -0.12238098, -0.12815583, ..., -0.12898709,\n", | |
" 0.01830324, -0.13048793],\n", | |
" [ 0.11313245, -0.00234493, -0.1835047 , ..., 0.046124 ,\n", | |
" -0.03200746, -0.09186589],\n", | |
" [-0.01789492, -0.06258424, 0.12082538, ..., 0.07070544,\n", | |
" -0.01403479, -0.07681925],\n", | |
" ...,\n", | |
" [ 0.00476433, -0.16728209, -0.06598336, ..., 0.1464505 ,\n", | |
" 0.02849043, -0.05748696],\n", | |
" [-0.02408532, -0.12983347, -0.08509277, ..., 0.13052596,\n", | |
" 0.04662877, -0.09346067],\n", | |
" [ 0.02076837, -0.13136543, -0.01089873, ..., 0.1281885 ,\n", | |
" 0.06908399, -0.0808148 ]], dtype=float32)" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
 "# Re-derive the unit vectors by hand from the saved raw matrix: divide each\n",
 "# row by its L2 norm (keepdims=True so the norms broadcast over columns).\n",
 "row_norms = np.linalg.norm(original_vector, axis=1, keepdims=True)\n",
 "vectors2 = original_vector / row_norms\n",
 "vectors2"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
 "def same_matrix(m1, m2):\n",
 "    \"\"\"Return True iff m1 and m2 have identical shape and identical elements.\n",
 "\n",
 "    np.array_equal does both checks with a vectorised comparison, unlike the\n",
 "    Python-level all() over a flattened boolean array, which iterates per element.\n",
 "    \"\"\"\n",
 "    return np.array_equal(m1, m2)\n",
 "\n",
 "# NOTE(review): exact float equality is deliberate here — it holds only if gensim\n",
 "# normalises with the same division; the output of this cell confirms it (True).\n",
 "same_matrix(vectors1, vectors2)"
]
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment