Skip to content

Instantly share code, notes, and snippets.

@hiropppe
Last active December 25, 2020 17:56
Show Gist options
  • Save hiropppe/cf874500822e273101b162e373bcba13 to your computer and use it in GitHub Desktop.
Save hiropppe/cf874500822e273101b162e373bcba13 to your computer and use it in GitHub Desktop.
Gensim で作成した小さめの wv からコサイン類似度行列の作り方
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Display every top-level expression in a cell, not just the last one.\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"InteractiveShell.ast_node_interactivity = \"all\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import MeCab\n",
"\n",
"# Output format: one '<POS>[SEP]<lemma>' token per word, space-separated;\n",
"# unknown words pass through as-is (-U); large input buffer (-b) for long lines.\n",
"mecab = MeCab.Tagger('-F\\\\s%f[0][SEP]%f[6] -U\\\\s%m -E\\\\n -b 100000 -d /usr/local/lib/mecab/dic/ipadic')\n",
"\n",
"with open('stop_ja.txt') as fsw:\n",
"    stop_words = frozenset(fsw.read().splitlines())\n",
"\n",
"lower = True\n",
"# Content-word POS prefixes to keep: nouns, verbs, adjectives, adverbs.\n",
"# Hoisted out of the loop — it is invariant across all lines/tokens.\n",
"keep_pos = ('名詞[SEP]', '動詞[SEP]', '形容詞[SEP]', '副詞[SEP]')\n",
"\n",
"with open('/data/corpus.txt') as reader, open('./corpus.tok', 'w') as writer:\n",
"    for line in reader:\n",
"        tokens = []\n",
"        for pos_word in mecab.parse(line).strip().split(' '):\n",
"            # Generator form avoids building a throwaway list for any().\n",
"            if any(pos in pos_word for pos in keep_pos):\n",
"                # Everything after the first '[SEP]' is the lemma.\n",
"                token = pos_word[pos_word.find('[SEP]') + 5:]\n",
"                tokens.append(token)\n",
"\n",
"        # NOTE(review): stop words are matched before lowercasing — assumes the\n",
"        # stop list matches the surface forms MeCab emits; confirm if needed.\n",
"        tokens = [token.lower() if lower else token for token in tokens if token not in stop_words]\n",
"\n",
"        if tokens:\n",
"            print(' '.join(tokens), file=writer)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-11-10 02:21:01,266 : INFO : collecting all words and their counts\n",
"2020-11-10 02:21:01,268 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
"2020-11-10 02:21:01,661 : INFO : collected 23740 word types from a corpus of 780333 raw words and 7155 sentences\n",
"2020-11-10 02:21:01,662 : INFO : Loading a fresh vocabulary\n",
"2020-11-10 02:21:01,708 : INFO : effective_min_count=5 retains 10710 unique words (45% of original 23740, drops 13030)\n",
"2020-11-10 02:21:01,709 : INFO : effective_min_count=5 leaves 756097 word corpus (96% of original 780333, drops 24236)\n",
"2020-11-10 02:21:01,776 : INFO : deleting the raw counts dictionary of 23740 items\n",
"2020-11-10 02:21:01,779 : INFO : sample=0.001 downsamples 30 most-common words\n",
"2020-11-10 02:21:01,780 : INFO : downsampling leaves estimated 680211 word corpus (90.0% of prior 756097)\n",
"2020-11-10 02:21:01,827 : INFO : estimated required memory for 10710 words and 100 dimensions: 13923000 bytes\n",
"2020-11-10 02:21:01,829 : INFO : resetting layer weights\n",
"2020-11-10 02:21:08,580 : INFO : training model with 2 workers on 10710 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=20 window=5\n",
"2020-11-10 02:21:09,612 : INFO : EPOCH 1 - PROGRESS: at 39.71% examples, 192691 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:10,614 : INFO : EPOCH 1 - PROGRESS: at 63.76% examples, 189752 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:11,650 : INFO : EPOCH 1 - PROGRESS: at 92.38% examples, 199473 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:11,917 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:11,960 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:11,963 : INFO : EPOCH - 1 : training on 780333 raw words (680257 effective words) took 3.4s, 201393 effective words/s\n",
"2020-11-10 02:21:12,978 : INFO : EPOCH 2 - PROGRESS: at 38.52% examples, 187929 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:14,015 : INFO : EPOCH 2 - PROGRESS: at 67.55% examples, 205395 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:15,034 : INFO : EPOCH 2 - PROGRESS: at 97.57% examples, 213650 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:15,118 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:15,180 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:15,182 : INFO : EPOCH - 2 : training on 780333 raw words (680427 effective words) took 3.2s, 211707 effective words/s\n",
"2020-11-10 02:21:16,204 : INFO : EPOCH 3 - PROGRESS: at 41.45% examples, 204081 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:17,215 : INFO : EPOCH 3 - PROGRESS: at 65.72% examples, 199128 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:18,227 : INFO : EPOCH 3 - PROGRESS: at 91.21% examples, 198536 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:18,613 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:18,672 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:18,674 : INFO : EPOCH - 3 : training on 780333 raw words (680134 effective words) took 3.5s, 195380 effective words/s\n",
"2020-11-10 02:21:19,687 : INFO : EPOCH 4 - PROGRESS: at 41.45% examples, 205383 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:20,690 : INFO : EPOCH 4 - PROGRESS: at 71.13% examples, 222147 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:21,624 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:21,655 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:21,656 : INFO : EPOCH - 4 : training on 780333 raw words (680112 effective words) took 3.0s, 228708 effective words/s\n",
"2020-11-10 02:21:22,666 : INFO : EPOCH 5 - PROGRESS: at 39.71% examples, 196997 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:23,743 : INFO : EPOCH 5 - PROGRESS: at 61.34% examples, 176872 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:24,759 : INFO : EPOCH 5 - PROGRESS: at 88.41% examples, 188818 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:25,134 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:25,179 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:25,180 : INFO : EPOCH - 5 : training on 780333 raw words (680387 effective words) took 3.5s, 193328 effective words/s\n",
"2020-11-10 02:21:25,181 : INFO : training on a 3901665 raw words (3401317 effective words) took 16.6s, 204901 effective words/s\n"
]
}
],
"source": [
"import logging\n",
"\n",
"from gensim.models import Word2Vec, KeyedVectors\n",
"from gensim.models.word2vec import LineSentence\n",
"\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
"\n",
"# Stream the tokenized corpus: one whitespace-separated sentence per line.\n",
"lsc = LineSentence('./corpus.tok')\n",
"\n",
"# CBOW (sg=0) with negative sampling (hs=0, 20 noise words), 100-dim vectors.\n",
"# NOTE(review): 'size' is the gensim 3.x keyword; gensim 4.0 renamed it to 'vector_size'.\n",
"sg = 0\n",
"size = 100\n",
"hs = 0\n",
"negative = 20\n",
"workers = 2\n",
"\n",
"model = Word2Vec(lsc,\n",
" sg=sg,\n",
" size=size,\n",
" hs=hs,\n",
" negative=negative,\n",
" workers=workers)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"# Snapshot the raw embedding matrix: init_sims(replace=True) in the next cell\n",
"# overwrites model.wv.vectors in place with L2-normalized rows, so the raw\n",
"# weights would otherwise be lost.  copy.copy on an ndarray yields a real copy.\n",
"original_vector = copy.copy(model.wv.vectors)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-11-10 02:21:25,230 : INFO : precomputing L2-norms of word weight vectors\n"
]
}
],
"source": [
"# L2-normalize all word vectors in place; replace=True discards the raw weights\n",
"# (saves memory, but the model can no longer be trained further).\n",
"# NOTE(review): init_sims is deprecated in gensim 4.x (see get_normed_vectors).\n",
"model.init_sims(replace=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.01522974, -0.12238098, -0.12815583, ..., -0.12898709,\n",
" 0.01830324, -0.13048793],\n",
" [ 0.11313245, -0.00234493, -0.1835047 , ..., 0.046124 ,\n",
" -0.03200746, -0.09186589],\n",
" [-0.01789492, -0.06258424, 0.12082538, ..., 0.07070544,\n",
" -0.01403479, -0.07681925],\n",
" ...,\n",
" [ 0.00476433, -0.16728209, -0.06598336, ..., 0.1464505 ,\n",
" 0.02849043, -0.05748696],\n",
" [-0.02408532, -0.12983347, -0.08509277, ..., 0.13052596,\n",
" 0.04662877, -0.09346067],\n",
" [ 0.02076837, -0.13136543, -0.01089873, ..., 0.1281885 ,\n",
" 0.06908399, -0.0808148 ]], dtype=float32)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# After init_sims(replace=True) these rows are unit-length (L2-normalized).\n",
"vectors1 = model.wv.vectors\n",
"vectors1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.9999999 , 0.38079122, 0.32834882, ..., 0.3337108 , 0.2777639 ,\n",
" 0.3528665 ],\n",
" [0.38079122, 1. , 0.25065055, ..., 0.36218068, 0.27640563,\n",
" 0.38131532],\n",
" [0.32834882, 0.25065055, 1.0000004 , ..., 0.28533465, 0.32913357,\n",
" 0.30477363],\n",
" ...,\n",
" [0.3337108 , 0.36218068, 0.28533465, ..., 0.99999976, 0.9267827 ,\n",
" 0.9388081 ],\n",
" [0.2777639 , 0.27640563, 0.32913357, ..., 0.9267827 , 0.99999976,\n",
" 0.86002296],\n",
" [0.3528665 , 0.38131532, 0.30477363, ..., 0.9388081 , 0.86002296,\n",
" 1. ]], dtype=float32)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows are unit vectors, so the Gram matrix is exactly the cosine-similarity\n",
"# matrix (diagonal ~= 1 up to float32 rounding).\n",
"similarity_matrix1 = vectors1.dot(vectors1.T)\n",
"similarity_matrix1"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0, 129, 141, ..., 221, 382, 512],\n",
" [ 1, 2890, 395, ..., 977, 596, 374],\n",
" [ 2, 266, 194, ..., 163, 28, 12],\n",
" ...,\n",
" [10707, 8719, 9374, ..., 27, 13, 9],\n",
" [10708, 3829, 4879, ..., 27, 12, 94],\n",
" [10709, 4474, 4658, ..., 76, 41, 96]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"# argsort is ascending, so reverse each row to rank neighbours by decreasing\n",
"# similarity; column 0 of each row is the word itself (self-similarity ~= 1).\n",
"similarity_rank1 = np.argsort(similarity_matrix1)[:, ::-1]\n",
"similarity_rank1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"gensim のコサイン類似度の結果と比較"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(True,\n",
" [5991, 8060, 10131, 8552, 9070, 7986, 6843, 5736, 7888, 5049],\n",
" [5991, 8060, 10131, 8552, 9070, 7986, 6843, 5736, 7888, 5049])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [9212, 6410, 5736, 7400, 9307, 8656, 8181, 10268, 10227, 3506],\n",
" [9212, 6410, 5736, 7400, 9307, 8656, 8181, 10268, 10227, 3506])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [5704, 8609, 10034, 10453, 9366, 10159, 8622, 10272, 10263, 7514],\n",
" [5704, 8609, 10034, 10453, 9366, 10159, 8622, 10272, 10263, 7514])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [8170, 9874, 5672, 5355, 5262, 3545, 9049, 9142, 5156, 7611],\n",
" [8170, 9874, 5672, 5355, 5262, 3545, 9049, 9142, 5156, 7611])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [9347, 9011, 7116, 5349, 10089, 9051, 5597, 7158, 7878, 6094],\n",
" [9347, 9011, 7116, 5349, 10089, 9051, 5597, 7158, 7878, 6094])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [5434, 2858, 9925, 4479, 6871, 4622, 7021, 5588, 6336, 5233],\n",
" [5434, 2858, 9925, 4479, 6871, 4622, 7021, 5588, 6336, 5233])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [2695, 2224, 3064, 3755, 3093, 3674, 3877, 3062, 5240, 4478],\n",
" [2695, 2224, 3064, 3755, 3093, 3674, 3877, 3062, 5240, 4478])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [3942, 3681, 3941, 4529, 3086, 7263, 4101, 3514, 3046, 9907],\n",
" [3942, 3681, 3941, 4529, 3086, 7263, 4101, 3514, 3046, 9907])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [9231, 7773, 7851, 8644, 7772, 10682, 8064, 7988, 3911, 9083],\n",
" [9231, 7773, 7851, 8644, 7772, 10682, 8064, 7988, 3911, 9083])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [8618, 9078, 6240, 6296, 8963, 7151, 10135, 8629, 10191, 7439],\n",
" [8618, 9078, 6240, 6296, 8963, 7151, 10135, 8629, 10191, 7439])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check 10 random words: the top-10 neighbours from our similarity matrix\n",
"# must match gensim's similar_by_word.  The 1:11 slice skips column 0, which is\n",
"# the query word itself.\n",
"# NOTE(review): no RNG seed is set, so the sampled words differ on each run.\n",
"for _ in range(10):\n",
" sample_word_index = np.random.randint(len(model.wv.vocab))\n",
"\n",
" a = similarity_rank1[sample_word_index, 1:11].tolist()\n",
" b = [model.wv.vocab[s[0]].index for s in model.wv.similar_by_word(model.wv.index2word[sample_word_index], topn=10)]\n",
" a == b, a, b"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"正規化前ベクトルを自分で L2 正規化したものと同じであることを確認"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.01522974, -0.12238098, -0.12815583, ..., -0.12898709,\n",
" 0.01830324, -0.13048793],\n",
" [ 0.11313245, -0.00234493, -0.1835047 , ..., 0.046124 ,\n",
" -0.03200746, -0.09186589],\n",
" [-0.01789492, -0.06258424, 0.12082538, ..., 0.07070544,\n",
" -0.01403479, -0.07681925],\n",
" ...,\n",
" [ 0.00476433, -0.16728209, -0.06598336, ..., 0.1464505 ,\n",
" 0.02849043, -0.05748696],\n",
" [-0.02408532, -0.12983347, -0.08509277, ..., 0.13052596,\n",
" 0.04662877, -0.09346067],\n",
" [ 0.02076837, -0.13136543, -0.01089873, ..., 0.1281885 ,\n",
" 0.06908399, -0.0808148 ]], dtype=float32)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Manually L2-normalize the pre-init_sims snapshot (row-wise); this should\n",
"# reproduce vectors1 exactly.\n",
"vectors2 = original_vector / np.linalg.norm(original_vector, axis=1, keepdims=True)\n",
"vectors2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def same_matrix(m1, m2):\n",
"    \"\"\"Return True iff m1 and m2 have the same shape and exactly equal entries.\"\"\"\n",
"    # np.array_equal checks shape and element equality in one vectorized call,\n",
"    # avoiding flatten()'s two full copies and all()'s slow element-by-element\n",
"    # Python iteration over a numpy bool array.\n",
"    return np.array_equal(m1, m2)\n",
"\n",
"same_matrix(vectors1, vectors2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment