Basic Word2Vec Sample Code
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- https://github.com/tensorflow/tensorflow/blob/r1.3/tensorflow/examples/tutorials/word2vec/word2vec_basic.py\n",
"- https://djsaunde.github.io/word2vec.html\n",
"- http://khanrc.tistory.com/entry/TensorFlow-7-word2vec-Implementation\n",
"- http://solarisailab.com/archives/374\n",
"- https://github.com/danielfrg/word2vec/blob/master/examples/word2vec.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import zipfile\n",
"import urllib.request\n",
"import tensorflow as tf\n",
"\n",
"import collections\n",
"import numpy as np\n",
"import random\n",
"\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found and verified text8.zip\n",
"data size: 17005207\n"
]
}
],
"source": [
"# download the data\n",
"url = 'http://mattmahoney.net/dc/'\n",
"\n",
"def download(filename, expected_bytes):\n",
"    \"\"\"\n",
"    Download a file if not present, and make sure it's the right size.\n",
"    \"\"\"\n",
"    if not os.path.exists(filename):\n",
"        filename, _ = urllib.request.urlretrieve(url + filename, filename)\n",
"    statinfo = os.stat(filename)\n",
"    if statinfo.st_size == expected_bytes:\n",
"        print('Found and verified', filename)\n",
"    else:\n",
"        print(statinfo.st_size)\n",
"        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')\n",
"    return filename\n",
"\n",
"# download the file\n",
"file_ = download('text8.zip', 31344016)\n",
"\n",
"def read_data(filename):\n",
"    \"\"\"\n",
"    Parse the file enclosed in the 'filename' zip file into a list of words.\n",
"    \"\"\"\n",
"    # unzip the file\n",
"    with zipfile.ZipFile(filename) as f:\n",
"        # read the data into the 'data' variable\n",
"        data = tf.compat.as_str(f.read(f.namelist()[0])).split()\n",
"    # return the data\n",
"    return data\n",
"\n",
"words = read_data(file_)\n",
"print('data size:', len(words))"
]
},
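{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (an addition, not part of the original gist): the sketch below only peeks at the first few tokens returned by `read_data`, assuming the cell above has already populated `words`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: inspect the corpus produced by read_data (assumes 'words' from the cell above)\n",
"print('first 10 tokens:', words[:10])\n",
"print('total tokens:', len(words))"
]
},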
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"most common words (+UNK): [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]\n",
"sample data: [5239, 3081, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']\n"
]
}
],
"source": [
"# build the dictionary and replace rare words with the \"UNK\" token.\n",
"vocabulary_size = 50000\n",
"\n",
"def build_dataset(words):\n",
"    # create counts list, set counts for \"UNK\" token to -1 (undefined)\n",
"    count = [['UNK', -1]]\n",
"    # add counts of the 49,999 most common tokens in 'words'\n",
"    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))\n",
"    # create the dictionary data structure\n",
"    dictionary = {}\n",
"    # give a unique integer ID to each token in the dictionary\n",
"    for word, _ in count:\n",
"        dictionary[word] = len(dictionary)\n",
"    # create a list data structure for the data\n",
"    data = []\n",
"    # keep track of the number of \"UNK\" token occurrences\n",
"    unk_count = 0\n",
"    # for each word in our list of words\n",
"    for word in words:\n",
"        # if it's in the dictionary, get its index\n",
"        if word in dictionary:\n",
"            index = dictionary[word]\n",
"        # otherwise, set the index equal to zero (index of \"UNK\") and increment the \"UNK\" count\n",
"        else:\n",
"            index = 0  # dictionary['UNK']\n",
"            unk_count += 1\n",
"        # append its index to the 'data' list structure\n",
"        data.append(index)\n",
"    # set the count of \"UNK\" in the 'count' data structure\n",
"    count[0][1] = unk_count\n",
"    # invert the dictionary; it becomes (index, word) key-value pairs\n",
"    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
"    # return the data (indices), counts, dictionary, and inverted dictionary\n",
"    return data, count, dictionary, reverse_dictionary\n",
"\n",
"# build the dataset\n",
"data, count, dictionary, reverse_dictionary = build_dataset(words)\n",
"# free up some memory\n",
"del words\n",
"# print out stats\n",
"print('most common words (+UNK):', count[:10])\n",
"print('sample data:', data[:10], [reverse_dictionary[i] for i in data[:10]])"
]
},
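{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration of the mappings built above (an addition, not part of the original gist), this sketch round-trips a token through `dictionary` and `reverse_dictionary`; the word 'anarchism' is just an example taken from the sample data printed above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: round-trip a token through the two mappings built by build_dataset\n",
"word = 'anarchism'  # example token from the sample data above\n",
"idx = dictionary.get(word, 0)  # 0 is the index reserved for 'UNK'\n",
"print(word, '->', idx, '->', reverse_dictionary[idx])\n",
"\n",
"# any word outside the 50,000-word vocabulary collapses to 'UNK' (index 0)\n",
"print(dictionary.get('notaword12345', 0), '->', reverse_dictionary[0])"
]
},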
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"6700074"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_index"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3081 originated -> 12 as\n",
"3081 originated -> 5239 anarchism\n",
"12 as -> 6 a\n",
"12 as -> 3081 originated\n",
"6 a -> 12 as\n",
"6 a -> 195 term\n",
"195 term -> 2 of\n",
"195 term -> 6 a\n"
]
}
],
"source": [
"data_index = 0\n",
"\n",
"# generate a training batch for the skip-gram model.\n",
"def generate_batch(batch_size, num_skips, skip_window):\n",
"    global data_index\n",
"    # make sure our parameters are self-consistent\n",
"    assert batch_size % num_skips == 0\n",
"    assert num_skips <= 2 * skip_window\n",
"    # create empty batch ndarray using 'batch_size'\n",
"    batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
"    # create empty labels ndarray using 'batch_size'\n",
"    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
"    # [ skip_window target skip_window ]\n",
"    span = 2 * skip_window + 1\n",
"    # create a buffer object for prepping batch data\n",
"    buffer = collections.deque(maxlen=span)\n",
"    # for each element in our calculated span, append the datum at 'data_index' and increment 'data_index' modulo the amount of data\n",
"    for _ in range(span):\n",
"        buffer.append(data[data_index])\n",
"        data_index = (data_index + 1) % len(data)\n",
"    # loop for 'batch_size' // 'num_skips'\n",
"    for i in range(batch_size // num_skips):\n",
"        # target label at the center of the buffer\n",
"        target = skip_window\n",
"        targets_to_avoid = [skip_window]\n",
"        # loop for 'num_skips'\n",
"        for j in range(num_skips):\n",
"            # loop through all 'targets_to_avoid'\n",
"            while target in targets_to_avoid:\n",
"                # pick a random index as target\n",
"                target = random.randint(0, span - 1)\n",
"            # put it in 'targets_to_avoid'\n",
"            targets_to_avoid.append(target)\n",
"            # set the skip window in the minibatch data\n",
"            batch[i * num_skips + j] = buffer[skip_window]\n",
"            # set the target in the minibatch labels\n",
"            labels[i * num_skips + j, 0] = buffer[target]\n",
"        # add the data at the current 'data_index' to the buffer\n",
"        buffer.append(data[data_index])\n",
"        # increment 'data_index'\n",
"        data_index = (data_index + 1) % len(data)\n",
"    # return the minibatch data and corresponding labels\n",
"    return batch, labels\n",
"\n",
"# get a minibatch\n",
"batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)\n",
"\n",
"# print out part of the minibatch to the console\n",
"for i in range(8):\n",
"    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])"
]
},
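{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the (center, context) pairing a little more concrete, the sketch below (an addition, with illustrative parameter values only) regenerates a batch using a wider window. Note that every call to `generate_batch` advances the global `data_index`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: a wider context window; skip_window=2 allows up to 4 context words per center word\n",
"batch2, labels2 = generate_batch(batch_size=8, num_skips=4, skip_window=2)\n",
"print('batch shape :', batch2.shape)   # (8,)\n",
"print('labels shape:', labels2.shape)  # (8, 1)\n",
"for i in range(8):\n",
"    print(batch2[i], reverse_dictionary[batch2[i]], '->', labels2[i, 0], reverse_dictionary[labels2[i, 0]])"
]
},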
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([3081, 3081, 12, 12, 6, 6, 195, 195], dtype=int32)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"batch"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 12],\n",
" [5239],\n",
" [ 6],\n",
" [3081],\n",
" [ 12],\n",
" [ 195],\n",
" [ 2],\n",
" [ 6]], dtype=int32)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# # Creates a graph.\n",
"# with tf.device('/gpu:0'):\n",
"#     a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')\n",
"#     b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')\n",
"#     c = tf.matmul(a, b)\n",
"# # Creates a session with log_device_placement set to True.\n",
"# config = tf.ConfigProto()\n",
"# config.gpu_options.allow_growth = True\n",
"# sess = tf.Session(config=config)\n",
"# # Runs the op.\n",
"# print(sess.run(c))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hyperparameters\n",
"batch_size = 128\n",
"embedding_size = 128  # dimension of the embedding vector\n",
"skip_window = 1  # how many words to consider to the left and right\n",
"num_skips = 2  # how many times to reuse an input to generate a label\n",
"\n",
"# we choose a random validation set to sample nearest neighbors\n",
"# here, we limit the validation samples to the words that have a low\n",
"# numeric ID, which are also the most frequently occurring words\n",
"valid_size = 16  # size of random set of words to evaluate similarity on\n",
"valid_window = 100  # only pick development samples from the first 'valid_window' words\n",
"valid_examples = np.random.choice(valid_window, valid_size, replace=False)\n",
"num_sampled = 64  # number of negative examples to sample\n",
"\n",
"# create computation graph\n",
"graph = tf.Graph()\n",
"\n",
"with graph.as_default(), tf.device('/gpu:0'):\n",
"# with graph.device('/device:CPU:0'):\n",
"    # input data\n",
"    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])\n",
"    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
"    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
"\n",
"    # operations and variables\n",
"    # look up embeddings for inputs\n",
"    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
"    embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n",
"\n",
"    # construct the variables for the NCE loss\n",
"    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))\n",
"    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
"\n",
"    # compute the average NCE loss for the batch.\n",
"    # tf.nn.nce_loss automatically draws a new sample of the negative labels each time we evaluate the loss.\n",
"    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,\n",
"                                         labels=train_labels, inputs=embed, num_sampled=num_sampled, num_classes=vocabulary_size))\n",
"\n",
"    # construct the SGD optimizer using a learning rate of 1.0\n",
"    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)\n",
"\n",
"    # compute the cosine similarity between minibatch examples and all embeddings\n",
"    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
"    normalized_embeddings = embeddings / norm\n",
"    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)\n",
"    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)\n",
"\n",
"    # add variable initializer\n",
"    init = tf.global_variables_initializer()"
]
},
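{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `similarity` op defined above is just the cosine similarity between the validation embeddings and every row of the row-normalized embedding matrix. The following NumPy sketch (an addition, run on random toy data) shows the same computation outside the graph:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: the cosine-similarity computation from the graph, redone in NumPy on toy data\n",
"toy_emb = np.random.uniform(-1.0, 1.0, size=(10, 4)).astype(np.float32)  # stand-in for 'embeddings'\n",
"toy_norm = np.sqrt(np.sum(np.square(toy_emb), axis=1, keepdims=True))\n",
"toy_normalized = toy_emb / toy_norm                                       # unit-length rows\n",
"toy_valid_ids = np.array([0, 3])                                          # stand-in for 'valid_examples'\n",
"toy_sim = toy_normalized[toy_valid_ids].dot(toy_normalized.T)             # cosine similarities\n",
"print('similarity matrix shape:', toy_sim.shape)                          # (2, 10)\n",
"print('rows most similar to row 0:', (-toy_sim[0]).argsort()[:3])         # row 0 itself comes first"
]
},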
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"initialized.\n",
"Average loss at step 0 : 267.861968994\n",
"nearest to states: stalks, reflect, reportedly, equated, transmissions, mawr, dj, uncomplicated,\n",
"nearest to history: consistory, hereby, arbitrator, rushing, incline, irian, woz, bureaucrats,\n",
"nearest to see: insignificance, setups, unfettered, sure, hernando, technische, aromatic, boudin,\n",
"nearest to and: manitou, japh, hatfield, deathbed, gime, i, lackluster, configurations,\n",
"nearest to there: wove, achieve, otimes, psychologically, grammys, saxe, vinge, needle,\n",
"nearest to if: cinematographers, bromide, plane, declined, flatus, assyrians, coating, sedimentary,\n",
"nearest to united: heinemann, managing, oyly, bastille, bums, scipio, conceivably, carnarvon,\n",
"nearest to may: trump, wah, nanosystems, nuh, fond, phoca, hoplites, pseudo,\n",
"nearest to it: picts, regius, despot, maronites, buckingham, superintendent, synod, versed,\n",
"nearest to as: freestyle, botched, sigma, claudel, birkhoff, vec, rostock, iter,\n",
"nearest to has: joliot, lough, affiliations, montego, bertolt, belongs, gerrard, widgery,\n",
"nearest to no: stosunku, dq, celery, intermarriages, paralleled, completely, gillette, dramatized,\n",
"nearest to with: bed, backwards, digraphs, cvs, indie, generality, reflected, faber,\n",
"nearest to s: soured, fees, debbie, chaldea, cosmology, preseason, malleable, ebook,\n",
"nearest to on: postumus, entirety, lebanese, keeshond, longbow, hanoi, proofs, maintains,\n",
"nearest to use: wholesome, lowercase, rectangle, strangely, covert, situational, goidelic, vocal,\n",
"Average loss at step 2000 : 113.505586137\n",
"Average loss at step 4000 : 52.5892334256\n",
"Average loss at step 6000 : 33.4853440809\n",
"Average loss at step 8000 : 23.7135089986\n",
"Average loss at step 10000 : 18.1392678461\n",
"nearest to states: reflect, reportedly, transmissions, phi, expanded, topological, marked, dj,\n",
"nearest to history: reginae, aristotle, private, rushing, that, victoriae, austin, beast,\n",
"nearest to see: sure, louise, ac, cc, finalist, explains, prison, it,\n",
"nearest to and: in, of, UNK, victoriae, or, for, one, s,\n",
"nearest to there: achieve, display, loyalists, october, fifty, formula, psychologically, persephone,\n",
"nearest to if: altenberg, cl, gold, declined, reginae, agave, coating, joan,\n",
"nearest to united: advance, johanna, managing, gollancz, gb, includes, book, reginae,\n",
"nearest to may: rarest, incompatible, tissue, darius, nine, syllable, album, can,\n",
"nearest to it: he, consumer, samurai, this, interest, torment, also, novels,\n",
"nearest to as: for, in, homomorphism, sigma, is, and, member, cc,\n",
"nearest to has: have, reginae, had, experienced, singer, feature, plum, nicotine,\n",
"nearest to no: completely, and, agave, victoriae, sports, commemoration, analogue, austin,\n",
"nearest to with: in, and, from, baseball, by, bed, on, for,\n",
"nearest to s: and, libels, reginae, or, everything, victoriae, transform, compounds,\n",
"nearest to on: in, for, and, of, entirety, austin, escorial, with,\n",
"nearest to use: reginae, arch, cl, peak, pacifist, azerbaijani, lowercase, victoriae,\n",
"Average loss at step 12000 : 13.9590847435\n",
"Average loss at step 14000 : 11.7571933647\n",
"Average loss at step 16000 : 9.90769602233\n",
"Average loss at step 18000 : 8.59951363635\n",
"Average loss at step 20000 : 7.80482456183\n",
"nearest to states: reflect, reportedly, stalks, transmissions, topological, agouti, expanded, flee,\n",
"nearest to history: aristotle, reginae, dasyprocta, that, incline, private, the, rushing,\n",
"nearest to see: dasyprocta, sure, ac, six, louise, agouti, operatorname, and,\n",
"nearest to and: or, in, agouti, of, for, UNK, victoriae, with,\n",
"nearest to there: wove, psychologically, it, achieve, loyalists, which, fifty, circ,\n",
"nearest to if: agouti, isu, bromide, joan, prevention, when, berman, altenberg,\n",
"nearest to united: managing, johanna, advance, gollancz, gb, famine, akm, book,\n",
"nearest to may: can, rarest, fond, incompatible, agouti, astrologers, procedures, administering,\n",
"nearest to it: he, this, torment, also, annales, which, versed, consumer,\n",
"nearest to as: and, for, in, is, by, agouti, dasyprocta, was,\n",
"nearest to has: had, have, was, is, lough, are, plum, reginae,\n",
"nearest to no: completely, and, commemoration, venerable, pascal, agave, victoriae, a,\n",
"nearest to with: in, and, from, for, by, agouti, of, on,\n",
"nearest to s: and, or, the, his, reginae, of, dasyprocta, zero,\n",
"nearest to on: in, for, and, entirety, circ, with, from, of,\n",
"nearest to use: agouti, rectangle, reginae, arch, peak, fermilab, mats, vocal,\n",
"Average loss at step 22000 : 7.20771025956\n",
"Average loss at step 24000 : 6.90459180343\n",
"Average loss at step 26000 : 6.6249013586\n",
"Average loss at step 28000 : 6.17626126552\n",
"Average loss at step 30000 : 6.15420121288\n",
"nearest to states: reflect, reportedly, transmissions, stalks, topological, expanded, agouti, brazilian,\n",
"nearest to history: arbitrator, reginae, aristotle, woz, trinomial, dasyprocta, incline, hereby,\n",
"nearest to see: dasyprocta, sure, agouti, louise, abitibi, ac, six, four,\n",
"nearest to and: or, in, akh, of, victoriae, agouti, s, primigenius,\n",
"nearest to there: it, psychologically, he, which, wove, otimes, they, who,\n",
"nearest to if: when, agouti, isu, prevention, joan, bromide, had, berman,\n",
"nearest to united: managing, of, johanna, congestive, advance, mahogany, famine, akm,\n",
"nearest to may: can, would, incompatible, rarest, fond, nine, astrologers, procedures,\n",
"nearest to it: he, this, which, torment, also, there, they, annales,\n",
"nearest to as: by, and, for, dasyprocta, agouti, is, in, homomorphism,\n",
"nearest to has: had, have, is, was, lough, by, plum, are,\n",
"nearest to no: a, completely, and, venerable, commemoration, hijacker, agave, trinomial,\n",
"nearest to with: from, in, and, by, for, agouti, between, under,\n",
"nearest to s: and, or, his, of, zero, abet, four, trinomial,\n",
"nearest to on: in, for, and, from, at, with, two, circ,\n",
"nearest to use: rectangle, agouti, arch, reginae, fermilab, vocal, peak, vapor,\n",
"Average loss at step 32000 : 5.93763124335\n",
"Average loss at step 34000 : 5.88674430609\n",
"Average loss at step 36000 : 5.68349878764\n",
"Average loss at step 38000 : 5.29927867699\n",
"Average loss at step 40000 : 5.46713113594\n",
"nearest to states: reflect, stalks, reportedly, transmissions, topological, expanded, howlin, brazilian,\n",
"nearest to history: arbitrator, reginae, aristotle, dasyprocta, trinomial, woz, agouti, viridian,\n",
"nearest to see: goo, dasyprocta, sure, and, six, cc, louise, barbed,\n",
"nearest to and: or, six, UNK, four, dasyprocta, victoriae, primigenius, agouti,\n",
"nearest to there: it, which, he, they, psychologically, and, often, still,\n",
"nearest to if: when, agouti, isu, prevention, joan, berman, kuti, satan,\n",
"nearest to united: of, managing, johanna, congestive, mahogany, advance, akm, scipio,\n",
"nearest to may: can, would, incompatible, rarest, astrologers, procedures, fond, to,\n",
"nearest to it: he, this, which, there, they, torment, that, also,\n",
"nearest to as: by, dasyprocta, agouti, and, in, for, is, cajun,\n",
"nearest to has: had, have, is, was, lough, plum, auchinleck, by,\n",
"nearest to no: a, completely, venerable, and, hijacker, usually, agave, commemoration,\n",
"nearest to with: from, and, in, between, agouti, by, under, albury,\n",
"nearest to s: and, his, or, zero, trinomial, reginae, jarman, abet,\n",
"nearest to on: in, at, for, from, with, circ, dasyprocta, and,\n",
"nearest to use: agouti, reginae, rectangle, argo, albury, vapor, vocal, peak,\n",
"Average loss at step 42000 : 5.29190438855\n",
"Average loss at step 44000 : 5.33473292148\n",
"Average loss at step 46000 : 5.26311981332\n",
"Average loss at step 48000 : 5.02547687709\n",
"Average loss at step 50000 : 5.1276553297\n",
"nearest to states: reflect, reportedly, stalks, transmissions, topological, prism, expanded, howlin,\n",
"nearest to history: arbitrator, reginae, trinomial, aristotle, woz, dasyprocta, uncertain, hereby,\n",
"nearest to see: goo, dasyprocta, sure, barbed, prototype, and, lore, dimethyl,\n",
"nearest to and: or, but, agouti, akh, victoriae, abitibi, imperative, dasyprocta,\n",
"nearest to there: it, they, he, which, now, psychologically, still, often,\n",
"nearest to if: when, isu, agouti, prevention, joan, that, kuti, satan,\n",
"nearest to united: managing, johanna, of, mahogany, congestive, akm, scipio, advance,\n",
"nearest to may: can, would, incompatible, could, rarest, astrologers, procedures, will,\n",
"nearest to it: he, this, there, which, they, torment, also, not,\n",
"nearest to as: agouti, is, by, dasyprocta, prism, cajun, homomorphism, circ,\n",
"nearest to has: had, have, was, is, lough, auchinleck, plum, reginae,\n",
"nearest to no: a, completely, venerable, hijacker, agave, trinomial, usually, pascal,\n",
"nearest to with: and, from, by, in, between, agouti, for, albury,\n",
"nearest to s: and, his, trinomial, or, of, zero, jarman, five,\n",
"nearest to on: in, at, for, circ, from, nguni, dasyprocta, two,\n",
"nearest to use: agouti, reginae, argo, albury, rectangle, story, nasser, victoriae,\n",
"Average loss at step 52000 : 5.17703344357\n",
"Average loss at step 54000 : 5.11836372924\n",
"Average loss at step 56000 : 5.07739333141\n",
"Average loss at step 58000 : 5.12823755598\n",
"Average loss at step 60000 : 4.95967468631\n",
"nearest to states: reflect, reportedly, transmissions, stalks, topological, tamarin, cebus, prism,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, agouti, victoriae, trinomial, pulau,\n",
"nearest to see: goo, dasyprocta, but, can, barbed, sure, four, cardboard,\n",
"nearest to and: or, tamarin, cebus, but, victoriae, akh, microsite, microcebus,\n",
"nearest to there: it, they, he, which, now, still, often, psychologically,\n",
"nearest to if: when, isu, agouti, prevention, then, joan, that, kuti,\n",
"nearest to united: of, johanna, managing, cebus, mahogany, akm, congestive, famine,\n",
"nearest to may: can, would, could, will, incompatible, cannot, must, procedures,\n",
"nearest to it: he, this, there, which, they, tamarin, torment, she,\n",
"nearest to as: tamarin, capuchin, agouti, dasyprocta, marmoset, in, prism, by,\n",
"nearest to has: had, have, was, is, lough, auchinleck, dcsd, reginae,\n",
"nearest to no: a, completely, venerable, trinomial, agave, hijacker, tamarin, victoriae,\n",
"nearest to with: between, in, and, from, by, under, agouti, albury,\n",
"nearest to s: callithrix, zero, his, and, microcebus, trinomial, reginae, abet,\n",
"nearest to on: in, at, cebus, for, tamarin, circ, dasyprocta, iota,\n",
"nearest to use: tamarin, agouti, argo, reginae, cebus, albury, rectangle, microsite,\n",
"Average loss at step 62000 : 4.79626031196\n",
"Average loss at step 64000 : 4.80886316788\n",
"Average loss at step 66000 : 4.97172088683\n",
"Average loss at step 68000 : 4.92869670045\n",
"Average loss at step 70000 : 4.76773341072\n",
"nearest to states: reflect, reportedly, tamarin, stalks, transmissions, topological, cebus, prism,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, uncertain, hagbard, agouti, arbitrator,\n",
"nearest to see: goo, dasyprocta, but, can, sure, barbed, prototype, jellicoe,\n",
"nearest to and: or, but, cebus, tamarin, victoriae, callithrix, microcebus, abitibi,\n",
"nearest to there: it, they, which, still, now, he, often, psychologically,\n",
"nearest to if: when, agouti, isu, then, prevention, joan, however, though,\n",
"nearest to united: of, johanna, managing, cebus, famine, akm, mahogany, congestive,\n",
"nearest to may: can, would, could, will, must, incompatible, cannot, should,\n",
"nearest to it: he, this, there, which, they, she, torment, tamarin,\n",
"nearest to as: tamarin, agouti, capuchin, dasyprocta, prism, in, homomorphism, is,\n",
"nearest to has: had, have, was, is, lough, reginae, auchinleck, by,\n",
"nearest to no: completely, a, venerable, hijacker, trinomial, owing, commemoration, agave,\n",
"nearest to with: between, in, from, and, isu, agouti, albury, under,\n",
"nearest to s: callithrix, microcebus, zero, and, reginae, jarman, his, trinomial,\n",
"nearest to on: in, at, cebus, through, for, from, circ, dasyprocta,\n",
"nearest to use: tamarin, agouti, albury, rectangle, reginae, microsite, argo, callithrix,\n",
"Average loss at step 72000 : 4.79812862515\n",
"Average loss at step 74000 : 4.77556425592\n",
"Average loss at step 76000 : 4.86499132544\n",
"Average loss at step 78000 : 4.80580271888\n",
"Average loss at step 80000 : 4.81581209326\n",
"nearest to states: reflect, transmissions, reportedly, tamarin, stalks, topological, cebus, howlin,\n",
"nearest to history: callithrix, reginae, dasyprocta, uncertain, hagbard, tamarin, arbitrator, horrors,\n",
"nearest to see: goo, but, dasyprocta, barbed, boudin, sure, prototype, anatomical,\n",
"nearest to and: or, tamarin, but, cebus, microcebus, victoriae, callithrix, eight,\n",
"nearest to there: it, they, he, now, still, which, often, instances,\n",
"nearest to if: when, then, agouti, isu, however, though, dist, joan,\n",
"nearest to united: johanna, managing, scipio, famine, akm, mahogany, congestive, of,\n",
"nearest to may: can, would, could, will, must, should, cannot, might,\n",
"nearest to it: he, this, there, which, they, she, tamarin, torment,\n",
"nearest to as: tamarin, capuchin, agouti, prism, marmoset, dasyprocta, by, claudel,\n",
"nearest to has: had, have, was, is, lough, auchinleck, bomarc, dickson,\n",
"nearest to no: completely, venerable, a, hijacker, trinomial, agave, any, microsite,\n",
"nearest to with: between, in, from, agouti, isu, and, albury, by,\n",
"nearest to s: zero, callithrix, his, abet, microcebus, five, trinomial, microsite,\n",
"nearest to on: in, at, through, cebus, from, for, two, circ,\n",
"nearest to use: tamarin, agouti, albury, reginae, rectangle, microsite, argo, callithrix,\n",
"Average loss at step 82000 : 4.80138861847\n",
"Average loss at step 84000 : 4.77657458603\n",
"Average loss at step 86000 : 4.74599059129\n",
"Average loss at step 88000 : 4.68782997322\n",
"Average loss at step 90000 : 4.75792138851\n",
"nearest to states: reflect, reportedly, transmissions, tamarin, topological, cebus, stalks, howlin,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, arbitrator, hagbard, agouti, horrors,\n",
"nearest to see: goo, but, dasyprocta, anatomical, barbed, boudin, can, five,\n",
"nearest to and: or, but, tamarin, cebus, microcebus, abitibi, while, cegep,\n",
"nearest to there: they, it, he, still, now, often, which, but,\n",
"nearest to if: when, then, agouti, though, where, isu, however, is,\n",
"nearest to united: johanna, of, managing, scipio, mahogany, akm, congestive, famine,\n",
"nearest to may: can, would, could, will, should, must, cannot, might,\n",
"nearest to it: he, this, there, she, they, which, tamarin, torment,\n",
"nearest to as: tamarin, capuchin, agouti, dasyprocta, or, cegep, prism, when,\n",
"nearest to has: had, have, is, was, lough, dickson, but, since,\n",
"nearest to no: completely, a, venerable, any, hijacker, trinomial, owing, commemoration,\n",
"nearest to with: between, in, from, and, isu, agouti, during, ansgar,\n",
"nearest to s: his, callithrix, microcebus, and, abet, references, trinomial, zero,\n",
"nearest to on: in, at, cebus, through, upon, for, tamarin, from,\n",
"nearest to use: tamarin, agouti, argo, reginae, microsite, albury, callithrix, rectangle,\n",
"Average loss at step 92000 : 4.72107960212\n",
"Average loss at step 94000 : 4.62920029783\n",
"Average loss at step 96000 : 4.72291916287\n",
"Average loss at step 98000 : 4.6251190145\n",
"Average loss at step 100000 : 4.68731605053\n",
"nearest to states: reflect, transmissions, reportedly, topological, tamarin, howlin, cebus, territory,\n",
"nearest to history: callithrix, reginae, dasyprocta, arbitrator, uncertain, tamarin, viridian, hagbard,\n",
"nearest to see: goo, but, weightings, anatomical, dasyprocta, can, digamma, and,\n",
"nearest to and: or, but, tamarin, cebus, while, microcebus, however, callithrix,\n",
"nearest to there: they, it, he, now, still, often, which, instances,\n",
"nearest to if: when, though, where, then, agouti, isu, however, while,\n",
"nearest to united: johanna, managing, scipio, of, mahogany, akm, cebus, congestive,\n",
"nearest to may: can, would, could, will, should, must, might, cannot,\n",
"nearest to it: he, this, there, she, they, which, tamarin, torment,\n",
"nearest to as: tamarin, agouti, capuchin, dasyprocta, marmoset, prism, cegep, victoriae,\n",
"nearest to has: had, have, was, is, lough, dickson, cegep, monophysitism,\n",
"nearest to no: any, completely, a, venerable, hijacker, trinomial, only, agave,\n",
"nearest to with: between, in, from, isu, during, agouti, when, ansgar,\n",
"nearest to s: his, callithrix, abet, trinomial, five, microcebus, reginae, and,\n",
"nearest to on: in, at, through, upon, cebus, for, dasyprocta, roshan,\n",
"nearest to use: tamarin, agouti, reginae, argo, albury, most, microsite, callithrix,\n"
]
}
],
"source": [
"# steps to train the model\n",
"num_steps = 100001\n",
"config = tf.ConfigProto(allow_soft_placement=True)\n",
"config.gpu_options.allow_growth = True\n",
"\n",
"with tf.Session(graph=graph, config=config) as sess:\n",
"    # we must initialize all variables before using them\n",
"    # init.run()\n",
"    sess.run(init)\n",
"    # tf.global_variables_initializer().run()\n",
"    print('initialized.')\n",
"\n",
"    # loop through all training steps and keep track of loss\n",
"    average_loss = 0\n",
"    for step in range(num_steps):\n",
"        # generate a minibatch of training data\n",
"        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)\n",
"        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}\n",
"\n",
"        # we perform a single update step by evaluating the optimizer operation (including it\n",
"        # in the list of returned values of sess.run())\n",
"        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)\n",
"        average_loss += loss_val\n",
"\n",
"        # print the average loss every 2,000 steps\n",
"        if step % 2000 == 0:\n",
"            if step > 0:\n",
"                average_loss /= 2000\n",
"            # the average loss is an estimate of the loss over the last 2000 batches.\n",
"            print(\"Average loss at step \", step, \": \", average_loss)\n",
"            average_loss = 0\n",
"\n",
"        # compute cosine similarity every 10,000 steps (expensive!)\n",
"        if step % 10000 == 0:\n",
"            sim = similarity.eval()\n",
"            for i in range(valid_size):\n",
"                # get a single validation sample\n",
"                valid_word = reverse_dictionary[valid_examples[i]]\n",
"                # number of nearest neighbors\n",
"                top_k = 8\n",
"                # computing nearest neighbors\n",
"                nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n",
"                log_str = \"nearest to %s:\" % valid_word\n",
"                for k in range(top_k):\n",
"                    close_word = reverse_dictionary[nearest[k]]\n",
"                    log_str = \"%s %s,\" % (log_str, close_word)\n",
"                print(log_str)\n",
"\n",
"    final_embeddings = normalized_embeddings.eval()"
]
},
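{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once training has produced `final_embeddings`, nearest neighbors can be looked up for any vocabulary word outside the training loop. A small sketch (an addition; the query word 'history' is only an example and the helper name is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: query nearest neighbors from the trained, row-normalized embeddings\n",
"def nearest_words(word, top_k=8):\n",
"    idx = dictionary.get(word, 0)  # unknown words fall back to 'UNK'\n",
"    # rows of final_embeddings are unit length, so a dot product is the cosine similarity\n",
"    sims = np.dot(final_embeddings, final_embeddings[idx])\n",
"    nearest = (-sims).argsort()[1:top_k + 1]  # skip the query word itself\n",
"    return [reverse_dictionary[i] for i in nearest]\n",
"\n",
"print('history ->', nearest_words('history'))"
]
},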
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3(tf)",
"language": "python",
"name": "python3_tf"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}