Skip to content

Instantly share code, notes, and snippets.

@hiropppe
Last active December 25, 2020 17:56
Show Gist options
  • Save hiropppe/cf874500822e273101b162e373bcba13 to your computer and use it in GitHub Desktop.
Save hiropppe/cf874500822e273101b162e373bcba13 to your computer and use it in GitHub Desktop.
Gensim で作成した小さめの wv からコサイン類似度行列の作り方
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Display every top-level expression in a cell, not just the last one.\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"InteractiveShell.ast_node_interactivity = \"all\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import MeCab\n",
"\n",
"# Output format: one '<POS>[SEP]<lemma>' token per word, space-separated;\n",
"# unknown words pass through as-is (-U); large input buffer (-b) for long lines.\n",
"mecab = MeCab.Tagger('-F\\\\s%f[0][SEP]%f[6] -U\\\\s%m -E\\\\n -b 100000 -d /usr/local/lib/mecab/dic/ipadic')\n",
"\n",
"with open('stop_ja.txt') as fsw:\n",
"    stop_words = frozenset(fsw.read().splitlines())\n",
"\n",
"lower = True\n",
"# Content-word POS prefixes to keep: nouns, verbs, adjectives, adverbs.\n",
"# Hoisted out of the loop — it is invariant across all lines/tokens.\n",
"keep_pos = ('名詞[SEP]', '動詞[SEP]', '形容詞[SEP]', '副詞[SEP]')\n",
"\n",
"with open('/data/corpus.txt') as reader, open('./corpus.tok', 'w') as writer:\n",
"    for line in reader:\n",
"        tokens = []\n",
"        for pos_word in mecab.parse(line).strip().split(' '):\n",
"            # Generator form avoids building a throwaway list for any().\n",
"            if any(pos in pos_word for pos in keep_pos):\n",
"                # Everything after the first '[SEP]' is the lemma.\n",
"                token = pos_word[pos_word.find('[SEP]') + 5:]\n",
"                tokens.append(token)\n",
"\n",
"        # NOTE(review): stop words are matched before lowercasing — assumes the\n",
"        # stop list matches the surface forms MeCab emits; confirm if needed.\n",
"        tokens = [token.lower() if lower else token for token in tokens if token not in stop_words]\n",
"\n",
"        if tokens:\n",
"            print(' '.join(tokens), file=writer)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-11-10 02:21:01,266 : INFO : collecting all words and their counts\n",
"2020-11-10 02:21:01,268 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
"2020-11-10 02:21:01,661 : INFO : collected 23740 word types from a corpus of 780333 raw words and 7155 sentences\n",
"2020-11-10 02:21:01,662 : INFO : Loading a fresh vocabulary\n",
"2020-11-10 02:21:01,708 : INFO : effective_min_count=5 retains 10710 unique words (45% of original 23740, drops 13030)\n",
"2020-11-10 02:21:01,709 : INFO : effective_min_count=5 leaves 756097 word corpus (96% of original 780333, drops 24236)\n",
"2020-11-10 02:21:01,776 : INFO : deleting the raw counts dictionary of 23740 items\n",
"2020-11-10 02:21:01,779 : INFO : sample=0.001 downsamples 30 most-common words\n",
"2020-11-10 02:21:01,780 : INFO : downsampling leaves estimated 680211 word corpus (90.0% of prior 756097)\n",
"2020-11-10 02:21:01,827 : INFO : estimated required memory for 10710 words and 100 dimensions: 13923000 bytes\n",
"2020-11-10 02:21:01,829 : INFO : resetting layer weights\n",
"2020-11-10 02:21:08,580 : INFO : training model with 2 workers on 10710 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=20 window=5\n",
"2020-11-10 02:21:09,612 : INFO : EPOCH 1 - PROGRESS: at 39.71% examples, 192691 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:10,614 : INFO : EPOCH 1 - PROGRESS: at 63.76% examples, 189752 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:11,650 : INFO : EPOCH 1 - PROGRESS: at 92.38% examples, 199473 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:11,917 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:11,960 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:11,963 : INFO : EPOCH - 1 : training on 780333 raw words (680257 effective words) took 3.4s, 201393 effective words/s\n",
"2020-11-10 02:21:12,978 : INFO : EPOCH 2 - PROGRESS: at 38.52% examples, 187929 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:14,015 : INFO : EPOCH 2 - PROGRESS: at 67.55% examples, 205395 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:15,034 : INFO : EPOCH 2 - PROGRESS: at 97.57% examples, 213650 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:15,118 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:15,180 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:15,182 : INFO : EPOCH - 2 : training on 780333 raw words (680427 effective words) took 3.2s, 211707 effective words/s\n",
"2020-11-10 02:21:16,204 : INFO : EPOCH 3 - PROGRESS: at 41.45% examples, 204081 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:17,215 : INFO : EPOCH 3 - PROGRESS: at 65.72% examples, 199128 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:18,227 : INFO : EPOCH 3 - PROGRESS: at 91.21% examples, 198536 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:18,613 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:18,672 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:18,674 : INFO : EPOCH - 3 : training on 780333 raw words (680134 effective words) took 3.5s, 195380 effective words/s\n",
"2020-11-10 02:21:19,687 : INFO : EPOCH 4 - PROGRESS: at 41.45% examples, 205383 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:20,690 : INFO : EPOCH 4 - PROGRESS: at 71.13% examples, 222147 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:21,624 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:21,655 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:21,656 : INFO : EPOCH - 4 : training on 780333 raw words (680112 effective words) took 3.0s, 228708 effective words/s\n",
"2020-11-10 02:21:22,666 : INFO : EPOCH 5 - PROGRESS: at 39.71% examples, 196997 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:23,743 : INFO : EPOCH 5 - PROGRESS: at 61.34% examples, 176872 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:24,759 : INFO : EPOCH 5 - PROGRESS: at 88.41% examples, 188818 words/s, in_qsize 3, out_qsize 0\n",
"2020-11-10 02:21:25,134 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2020-11-10 02:21:25,179 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2020-11-10 02:21:25,180 : INFO : EPOCH - 5 : training on 780333 raw words (680387 effective words) took 3.5s, 193328 effective words/s\n",
"2020-11-10 02:21:25,181 : INFO : training on a 3901665 raw words (3401317 effective words) took 16.6s, 204901 effective words/s\n"
]
}
],
"source": [
"import logging\n",
"\n",
"from gensim.models import Word2Vec, KeyedVectors\n",
"from gensim.models.word2vec import LineSentence\n",
"\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
"\n",
"# Stream the tokenized corpus: one whitespace-separated sentence per line.\n",
"lsc = LineSentence('./corpus.tok')\n",
"\n",
"# CBOW (sg=0) with negative sampling (hs=0, 20 noise words), 100-dim vectors.\n",
"# NOTE(review): 'size' is the gensim 3.x keyword; gensim 4.0 renamed it to 'vector_size'.\n",
"sg = 0\n",
"size = 100\n",
"hs = 0\n",
"negative = 20\n",
"workers = 2\n",
"\n",
"model = Word2Vec(lsc,\n",
" sg=sg,\n",
" size=size,\n",
" hs=hs,\n",
" negative=negative,\n",
" workers=workers)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"# Snapshot the raw embedding matrix: init_sims(replace=True) in the next cell\n",
"# overwrites model.wv.vectors in place with L2-normalized rows, so the raw\n",
"# weights would otherwise be lost.  copy.copy on an ndarray yields a real copy.\n",
"original_vector = copy.copy(model.wv.vectors)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-11-10 02:21:25,230 : INFO : precomputing L2-norms of word weight vectors\n"
]
}
],
"source": [
"# L2-normalize all word vectors in place; replace=True discards the raw weights\n",
"# (saves memory, but the model can no longer be trained further).\n",
"# NOTE(review): init_sims is deprecated in gensim 4.x (see get_normed_vectors).\n",
"model.init_sims(replace=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.01522974, -0.12238098, -0.12815583, ..., -0.12898709,\n",
" 0.01830324, -0.13048793],\n",
" [ 0.11313245, -0.00234493, -0.1835047 , ..., 0.046124 ,\n",
" -0.03200746, -0.09186589],\n",
" [-0.01789492, -0.06258424, 0.12082538, ..., 0.07070544,\n",
" -0.01403479, -0.07681925],\n",
" ...,\n",
" [ 0.00476433, -0.16728209, -0.06598336, ..., 0.1464505 ,\n",
" 0.02849043, -0.05748696],\n",
" [-0.02408532, -0.12983347, -0.08509277, ..., 0.13052596,\n",
" 0.04662877, -0.09346067],\n",
" [ 0.02076837, -0.13136543, -0.01089873, ..., 0.1281885 ,\n",
" 0.06908399, -0.0808148 ]], dtype=float32)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# After init_sims(replace=True) these rows are unit-length (L2-normalized).\n",
"vectors1 = model.wv.vectors\n",
"vectors1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.9999999 , 0.38079122, 0.32834882, ..., 0.3337108 , 0.2777639 ,\n",
" 0.3528665 ],\n",
" [0.38079122, 1. , 0.25065055, ..., 0.36218068, 0.27640563,\n",
" 0.38131532],\n",
" [0.32834882, 0.25065055, 1.0000004 , ..., 0.28533465, 0.32913357,\n",
" 0.30477363],\n",
" ...,\n",
" [0.3337108 , 0.36218068, 0.28533465, ..., 0.99999976, 0.9267827 ,\n",
" 0.9388081 ],\n",
" [0.2777639 , 0.27640563, 0.32913357, ..., 0.9267827 , 0.99999976,\n",
" 0.86002296],\n",
" [0.3528665 , 0.38131532, 0.30477363, ..., 0.9388081 , 0.86002296,\n",
" 1. ]], dtype=float32)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows are unit vectors, so the Gram matrix is exactly the cosine-similarity\n",
"# matrix (diagonal ~= 1 up to float32 rounding).\n",
"similarity_matrix1 = vectors1.dot(vectors1.T)\n",
"similarity_matrix1"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0, 129, 141, ..., 221, 382, 512],\n",
" [ 1, 2890, 395, ..., 977, 596, 374],\n",
" [ 2, 266, 194, ..., 163, 28, 12],\n",
" ...,\n",
" [10707, 8719, 9374, ..., 27, 13, 9],\n",
" [10708, 3829, 4879, ..., 27, 12, 94],\n",
" [10709, 4474, 4658, ..., 76, 41, 96]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"# argsort is ascending, so reverse each row to rank neighbours by decreasing\n",
"# similarity; column 0 of each row is the word itself (self-similarity ~= 1).\n",
"similarity_rank1 = np.argsort(similarity_matrix1)[:, ::-1]\n",
"similarity_rank1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"gensim のコサイン類似度の結果と比較"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(True,\n",
" [5991, 8060, 10131, 8552, 9070, 7986, 6843, 5736, 7888, 5049],\n",
" [5991, 8060, 10131, 8552, 9070, 7986, 6843, 5736, 7888, 5049])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [9212, 6410, 5736, 7400, 9307, 8656, 8181, 10268, 10227, 3506],\n",
" [9212, 6410, 5736, 7400, 9307, 8656, 8181, 10268, 10227, 3506])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [5704, 8609, 10034, 10453, 9366, 10159, 8622, 10272, 10263, 7514],\n",
" [5704, 8609, 10034, 10453, 9366, 10159, 8622, 10272, 10263, 7514])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [8170, 9874, 5672, 5355, 5262, 3545, 9049, 9142, 5156, 7611],\n",
" [8170, 9874, 5672, 5355, 5262, 3545, 9049, 9142, 5156, 7611])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [9347, 9011, 7116, 5349, 10089, 9051, 5597, 7158, 7878, 6094],\n",
" [9347, 9011, 7116, 5349, 10089, 9051, 5597, 7158, 7878, 6094])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [5434, 2858, 9925, 4479, 6871, 4622, 7021, 5588, 6336, 5233],\n",
" [5434, 2858, 9925, 4479, 6871, 4622, 7021, 5588, 6336, 5233])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [2695, 2224, 3064, 3755, 3093, 3674, 3877, 3062, 5240, 4478],\n",
" [2695, 2224, 3064, 3755, 3093, 3674, 3877, 3062, 5240, 4478])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [3942, 3681, 3941, 4529, 3086, 7263, 4101, 3514, 3046, 9907],\n",
" [3942, 3681, 3941, 4529, 3086, 7263, 4101, 3514, 3046, 9907])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [9231, 7773, 7851, 8644, 7772, 10682, 8064, 7988, 3911, 9083],\n",
" [9231, 7773, 7851, 8644, 7772, 10682, 8064, 7988, 3911, 9083])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"(True,\n",
" [8618, 9078, 6240, 6296, 8963, 7151, 10135, 8629, 10191, 7439],\n",
" [8618, 9078, 6240, 6296, 8963, 7151, 10135, 8629, 10191, 7439])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check 10 random words: the top-10 neighbours from our similarity matrix\n",
"# must match gensim's similar_by_word.  The 1:11 slice skips column 0, which is\n",
"# the query word itself.\n",
"# NOTE(review): no RNG seed is set, so the sampled words differ on each run.\n",
"for _ in range(10):\n",
" sample_word_index = np.random.randint(len(model.wv.vocab))\n",
"\n",
" a = similarity_rank1[sample_word_index, 1:11].tolist()\n",
" b = [model.wv.vocab[s[0]].index for s in model.wv.similar_by_word(model.wv.index2word[sample_word_index], topn=10)]\n",
" a == b, a, b"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"正規化前ベクトルを自分で L2 正規化したものと同じであることを確認"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.01522974, -0.12238098, -0.12815583, ..., -0.12898709,\n",
" 0.01830324, -0.13048793],\n",
" [ 0.11313245, -0.00234493, -0.1835047 , ..., 0.046124 ,\n",
" -0.03200746, -0.09186589],\n",
" [-0.01789492, -0.06258424, 0.12082538, ..., 0.07070544,\n",
" -0.01403479, -0.07681925],\n",
" ...,\n",
" [ 0.00476433, -0.16728209, -0.06598336, ..., 0.1464505 ,\n",
" 0.02849043, -0.05748696],\n",
" [-0.02408532, -0.12983347, -0.08509277, ..., 0.13052596,\n",
" 0.04662877, -0.09346067],\n",
" [ 0.02076837, -0.13136543, -0.01089873, ..., 0.1281885 ,\n",
" 0.06908399, -0.0808148 ]], dtype=float32)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Manually L2-normalize the pre-init_sims snapshot (row-wise); this should\n",
"# reproduce vectors1 exactly.\n",
"vectors2 = original_vector / np.linalg.norm(original_vector, axis=1, keepdims=True)\n",
"vectors2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def same_matrix(m1, m2):\n",
"    \"\"\"Return True iff m1 and m2 have the same shape and exactly equal entries.\"\"\"\n",
"    # np.array_equal checks shape and element equality in one vectorized call,\n",
"    # avoiding flatten()'s two full copies and all()'s slow element-by-element\n",
"    # Python iteration over a numpy bool array.\n",
"    return np.array_equal(m1, m2)\n",
"\n",
"same_matrix(vectors1, vectors2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment