Basic Word2Vec Sample Code
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- https://github.com/tensorflow/tensorflow/blob/r1.3/tensorflow/examples/tutorials/word2vec/word2vec_basic.py\n",
"- https://djsaunde.github.io/word2vec.html\n",
"- http://khanrc.tistory.com/entry/TensorFlow-7-word2vec-Implementation\n",
"- http://solarisailab.com/archives/374\n",
"- https://github.com/danielfrg/word2vec/blob/master/examples/word2vec.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import zipfile\n",
"import urllib.request\n",
"import tensorflow as tf\n",
"\n",
"import collections\n",
"import numpy as np\n",
"import random\n",
"\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found and verified text8.zip\n",
"data size: 17005207\n"
]
}
],
"source": [
"# download the data\n",
"url = 'http://mattmahoney.net/dc/'\n",
"\n",
"def download(filename, expected_bytes):\n",
"    \"\"\"\n",
"    Download a file if not present, and make sure it's the right size.\n",
"    \"\"\"\n",
"    if not os.path.exists(filename):\n",
"        filename, _ = urllib.request.urlretrieve(url + filename, filename)\n",
"    statinfo = os.stat(filename)\n",
"    if statinfo.st_size == expected_bytes:\n",
"        print('Found and verified', filename)\n",
"    else:\n",
"        print(statinfo.st_size)\n",
"        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')\n",
"    return filename\n",
"\n",
"# download the file\n",
"file_ = download('text8.zip', 31344016)\n",
"\n",
"def read_data(filename):\n",
"    \"\"\"\n",
"    Parse the file enclosed in the 'filename' zip file into a list of words.\n",
"    \"\"\"\n",
"    # unzip the file\n",
"    with zipfile.ZipFile(filename) as f:\n",
"        # read the data into the 'data' variable\n",
"        data = tf.compat.as_str(f.read(f.namelist()[0])).split()\n",
"    # return the data\n",
"    return data\n",
"\n",
"words = read_data(file_)\n",
"print('data size:', len(words))"
]
},
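{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (an addition, not part of the original gist): the sketch below only peeks at the first few tokens returned by `read_data`, assuming the cell above has already populated `words`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: inspect the corpus produced by read_data (assumes 'words' from the cell above)\n",
"print('first 10 tokens:', words[:10])\n",
"print('total tokens:', len(words))"
]
},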
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"most common words (+UNK): [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]\n",
"sample data: [5239, 3081, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']\n"
]
}
],
"source": [
"# build the dictionary and replace rare words with the \"UNK\" token.\n",
"vocabulary_size = 50000\n",
"\n",
"def build_dataset(words):\n",
"    # create counts list, set counts for \"UNK\" token to -1 (undefined)\n",
"    count = [['UNK', -1]]\n",
"    # add counts of the 49,999 most common tokens in 'words'\n",
"    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))\n",
"    # create the dictionary data structure\n",
"    dictionary = {}\n",
"    # give a unique integer ID to each token in the dictionary\n",
"    for word, _ in count:\n",
"        dictionary[word] = len(dictionary)\n",
"    # create a list data structure for the data\n",
"    data = []\n",
"    # keep track of the number of \"UNK\" token occurrences\n",
"    unk_count = 0\n",
"    # for each word in our list of words\n",
"    for word in words:\n",
"        # if it's in the dictionary, get its index\n",
"        if word in dictionary:\n",
"            index = dictionary[word]\n",
"        # otherwise, set the index equal to zero (index of \"UNK\") and increment the \"UNK\" count\n",
"        else:\n",
"            index = 0  # dictionary['UNK']\n",
"            unk_count += 1\n",
"        # append its index to the 'data' list structure\n",
"        data.append(index)\n",
"    # set the count of \"UNK\" in the 'count' data structure\n",
"    count[0][1] = unk_count\n",
"    # invert the dictionary; it becomes (index, word) key-value pairs\n",
"    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
"    # return the data (indices), counts, dictionary, and inverted dictionary\n",
"    return data, count, dictionary, reverse_dictionary\n",
"\n",
"# build the dataset\n",
"data, count, dictionary, reverse_dictionary = build_dataset(words)\n",
"# free up some memory\n",
"del words\n",
"# print out stats\n",
"print('most common words (+UNK):', count[:10])\n",
"print('sample data:', data[:10], [reverse_dictionary[i] for i in data[:10]])"
]
},
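{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration of the mappings built above (an addition, not part of the original gist), this sketch round-trips a token through `dictionary` and `reverse_dictionary`; the word 'anarchism' is just an example taken from the sample data printed above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: round-trip a token through the two mappings built by build_dataset\n",
"word = 'anarchism'  # example token from the sample data above\n",
"idx = dictionary.get(word, 0)  # 0 is the index reserved for 'UNK'\n",
"print(word, '->', idx, '->', reverse_dictionary[idx])\n",
"\n",
"# any word outside the 50,000-word vocabulary collapses to 'UNK' (index 0)\n",
"print(dictionary.get('notaword12345', 0), '->', reverse_dictionary[0])"
]
},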
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"6700074"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_index"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3081 originated -> 12 as\n",
"3081 originated -> 5239 anarchism\n",
"12 as -> 6 a\n",
"12 as -> 3081 originated\n",
"6 a -> 12 as\n",
"6 a -> 195 term\n",
"195 term -> 2 of\n",
"195 term -> 6 a\n"
]
}
],
"source": [
"data_index = 0\n",
"\n",
"# generate a training batch for the skip-gram model.\n",
"def generate_batch(batch_size, num_skips, skip_window):\n",
"    global data_index\n",
"    # make sure our parameters are self-consistent\n",
"    assert batch_size % num_skips == 0\n",
"    assert num_skips <= 2 * skip_window\n",
"    # create empty batch ndarray using 'batch_size'\n",
"    batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
"    # create empty labels ndarray using 'batch_size'\n",
"    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
"    # [ skip_window target skip_window ]\n",
"    span = 2 * skip_window + 1\n",
"    # create a buffer object for prepping batch data\n",
"    buffer = collections.deque(maxlen=span)\n",
"    # for each element in our calculated span, append the datum at 'data_index' and increment 'data_index' modulo the amount of data\n",
"    for _ in range(span):\n",
"        buffer.append(data[data_index])\n",
"        data_index = (data_index + 1) % len(data)\n",
"    # loop for 'batch_size' // 'num_skips'\n",
"    for i in range(batch_size // num_skips):\n",
"        # target label at the center of the buffer\n",
"        target = skip_window\n",
"        targets_to_avoid = [skip_window]\n",
"        # loop for 'num_skips'\n",
"        for j in range(num_skips):\n",
"            # loop through all 'targets_to_avoid'\n",
"            while target in targets_to_avoid:\n",
"                # pick a random index as target\n",
"                target = random.randint(0, span - 1)\n",
"            # put it in 'targets_to_avoid'\n",
"            targets_to_avoid.append(target)\n",
"            # set the skip window in the minibatch data\n",
"            batch[i * num_skips + j] = buffer[skip_window]\n",
"            # set the target in the minibatch labels\n",
"            labels[i * num_skips + j, 0] = buffer[target]\n",
"        # add the data at the current 'data_index' to the buffer\n",
"        buffer.append(data[data_index])\n",
"        # increment 'data_index'\n",
"        data_index = (data_index + 1) % len(data)\n",
"    # return the minibatch data and corresponding labels\n",
"    return batch, labels\n",
"\n",
"# get a minibatch\n",
"batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)\n",
"\n",
"# print out part of the minibatch to the console\n",
"for i in range(8):\n",
"    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])"
]
},
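{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the (center, context) pairing a little more concrete, the sketch below (an addition, with illustrative parameter values only) regenerates a batch using a wider window. Note that every call to `generate_batch` advances the global `data_index`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: a wider context window; skip_window=2 allows up to 4 context words per center word\n",
"batch2, labels2 = generate_batch(batch_size=8, num_skips=4, skip_window=2)\n",
"print('batch shape :', batch2.shape)   # (8,)\n",
"print('labels shape:', labels2.shape)  # (8, 1)\n",
"for i in range(8):\n",
"    print(batch2[i], reverse_dictionary[batch2[i]], '->', labels2[i, 0], reverse_dictionary[labels2[i, 0]])"
]
},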
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([3081, 3081, 12, 12, 6, 6, 195, 195], dtype=int32)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"batch"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 12],\n",
" [5239],\n",
" [ 6],\n",
" [3081],\n",
" [ 12],\n",
" [ 195],\n",
" [ 2],\n",
" [ 6]], dtype=int32)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# # Creates a graph.\n",
"# with tf.device('/gpu:0'):\n",
"#     a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')\n",
"#     b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')\n",
"#     c = tf.matmul(a, b)\n",
"# # Creates a session with log_device_placement set to True.\n",
"# config = tf.ConfigProto()\n",
"# config.gpu_options.allow_growth = True\n",
"# sess = tf.Session(config=config)\n",
"# # Runs the op.\n",
"# print(sess.run(c))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hyperparameters\n",
"batch_size = 128\n",
"embedding_size = 128  # dimension of the embedding vector\n",
"skip_window = 1  # how many words to consider to the left and right\n",
"num_skips = 2  # how many times to reuse an input to generate a label\n",
"\n",
"# we choose a random validation set to sample nearest neighbors\n",
"# here, we limit the validation samples to the words that have a low\n",
"# numeric ID, which are also the most frequently occurring words\n",
"valid_size = 16  # size of random set of words to evaluate similarity on\n",
"valid_window = 100  # only pick development samples from the first 'valid_window' words\n",
"valid_examples = np.random.choice(valid_window, valid_size, replace=False)\n",
"num_sampled = 64  # number of negative examples to sample\n",
"\n",
"# create computation graph\n",
"graph = tf.Graph()\n",
"\n",
"with graph.as_default(), tf.device('/gpu:0'):\n",
"# with graph.device('/device:CPU:0'):\n",
"    # input data\n",
"    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])\n",
"    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
"    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
"\n",
"    # operations and variables\n",
"    # look up embeddings for inputs\n",
"    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
"    embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n",
"\n",
"    # construct the variables for the NCE loss\n",
"    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))\n",
"    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
"\n",
"    # compute the average NCE loss for the batch.\n",
"    # tf.nn.nce_loss automatically draws a new sample of the negative labels each time we evaluate the loss.\n",
"    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,\n",
"                                         labels=train_labels, inputs=embed, num_sampled=num_sampled, num_classes=vocabulary_size))\n",
"\n",
"    # construct the SGD optimizer using a learning rate of 1.0\n",
"    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)\n",
"\n",
"    # compute the cosine similarity between minibatch examples and all embeddings\n",
"    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
"    normalized_embeddings = embeddings / norm\n",
"    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)\n",
"    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)\n",
"\n",
"    # add variable initializer\n",
"    init = tf.global_variables_initializer()"
]
},
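{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `similarity` op defined above is just the cosine similarity between the validation embeddings and every row of the row-normalized embedding matrix. The following NumPy sketch (an addition, run on random toy data) shows the same computation outside the graph:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: the cosine-similarity computation from the graph, redone in NumPy on toy data\n",
"toy_emb = np.random.uniform(-1.0, 1.0, size=(10, 4)).astype(np.float32)  # stand-in for 'embeddings'\n",
"toy_norm = np.sqrt(np.sum(np.square(toy_emb), axis=1, keepdims=True))\n",
"toy_normalized = toy_emb / toy_norm                                       # unit-length rows\n",
"toy_valid_ids = np.array([0, 3])                                          # stand-in for 'valid_examples'\n",
"toy_sim = toy_normalized[toy_valid_ids].dot(toy_normalized.T)             # cosine similarities\n",
"print('similarity matrix shape:', toy_sim.shape)                          # (2, 10)\n",
"print('rows most similar to row 0:', (-toy_sim[0]).argsort()[:3])         # row 0 itself comes first"
]
},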
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"initialized.\n",
"Average loss at step 0 : 267.861968994\n",
"nearest to states: stalks, reflect, reportedly, equated, transmissions, mawr, dj, uncomplicated,\n",
"nearest to history: consistory, hereby, arbitrator, rushing, incline, irian, woz, bureaucrats,\n",
"nearest to see: insignificance, setups, unfettered, sure, hernando, technische, aromatic, boudin,\n",
"nearest to and: manitou, japh, hatfield, deathbed, gime, i, lackluster, configurations,\n",
"nearest to there: wove, achieve, otimes, psychologically, grammys, saxe, vinge, needle,\n",
"nearest to if: cinematographers, bromide, plane, declined, flatus, assyrians, coating, sedimentary,\n",
"nearest to united: heinemann, managing, oyly, bastille, bums, scipio, conceivably, carnarvon,\n",
"nearest to may: trump, wah, nanosystems, nuh, fond, phoca, hoplites, pseudo,\n",
"nearest to it: picts, regius, despot, maronites, buckingham, superintendent, synod, versed,\n",
"nearest to as: freestyle, botched, sigma, claudel, birkhoff, vec, rostock, iter,\n",
"nearest to has: joliot, lough, affiliations, montego, bertolt, belongs, gerrard, widgery,\n",
"nearest to no: stosunku, dq, celery, intermarriages, paralleled, completely, gillette, dramatized,\n",
"nearest to with: bed, backwards, digraphs, cvs, indie, generality, reflected, faber,\n",
"nearest to s: soured, fees, debbie, chaldea, cosmology, preseason, malleable, ebook,\n",
"nearest to on: postumus, entirety, lebanese, keeshond, longbow, hanoi, proofs, maintains,\n",
"nearest to use: wholesome, lowercase, rectangle, strangely, covert, situational, goidelic, vocal,\n",
"Average loss at step 2000 : 113.505586137\n",
"Average loss at step 4000 : 52.5892334256\n",
"Average loss at step 6000 : 33.4853440809\n",
"Average loss at step 8000 : 23.7135089986\n",
"Average loss at step 10000 : 18.1392678461\n",
"nearest to states: reflect, reportedly, transmissions, phi, expanded, topological, marked, dj,\n",
"nearest to history: reginae, aristotle, private, rushing, that, victoriae, austin, beast,\n",
"nearest to see: sure, louise, ac, cc, finalist, explains, prison, it,\n",
"nearest to and: in, of, UNK, victoriae, or, for, one, s,\n",
"nearest to there: achieve, display, loyalists, october, fifty, formula, psychologically, persephone,\n",
"nearest to if: altenberg, cl, gold, declined, reginae, agave, coating, joan,\n",
"nearest to united: advance, johanna, managing, gollancz, gb, includes, book, reginae,\n",
"nearest to may: rarest, incompatible, tissue, darius, nine, syllable, album, can,\n",
"nearest to it: he, consumer, samurai, this, interest, torment, also, novels,\n",
"nearest to as: for, in, homomorphism, sigma, is, and, member, cc,\n",
"nearest to has: have, reginae, had, experienced, singer, feature, plum, nicotine,\n",
"nearest to no: completely, and, agave, victoriae, sports, commemoration, analogue, austin,\n",
"nearest to with: in, and, from, baseball, by, bed, on, for,\n",
"nearest to s: and, libels, reginae, or, everything, victoriae, transform, compounds,\n",
"nearest to on: in, for, and, of, entirety, austin, escorial, with,\n",
"nearest to use: reginae, arch, cl, peak, pacifist, azerbaijani, lowercase, victoriae,\n",
"Average loss at step 12000 : 13.9590847435\n",
"Average loss at step 14000 : 11.7571933647\n",
"Average loss at step 16000 : 9.90769602233\n",
"Average loss at step 18000 : 8.59951363635\n",
"Average loss at step 20000 : 7.80482456183\n",
"nearest to states: reflect, reportedly, stalks, transmissions, topological, agouti, expanded, flee,\n",
"nearest to history: aristotle, reginae, dasyprocta, that, incline, private, the, rushing,\n",
"nearest to see: dasyprocta, sure, ac, six, louise, agouti, operatorname, and,\n",
"nearest to and: or, in, agouti, of, for, UNK, victoriae, with,\n",
"nearest to there: wove, psychologically, it, achieve, loyalists, which, fifty, circ,\n",
"nearest to if: agouti, isu, bromide, joan, prevention, when, berman, altenberg,\n",
"nearest to united: managing, johanna, advance, gollancz, gb, famine, akm, book,\n",
"nearest to may: can, rarest, fond, incompatible, agouti, astrologers, procedures, administering,\n",
"nearest to it: he, this, torment, also, annales, which, versed, consumer,\n",
"nearest to as: and, for, in, is, by, agouti, dasyprocta, was,\n",
"nearest to has: had, have, was, is, lough, are, plum, reginae,\n",
"nearest to no: completely, and, commemoration, venerable, pascal, agave, victoriae, a,\n",
"nearest to with: in, and, from, for, by, agouti, of, on,\n",
"nearest to s: and, or, the, his, reginae, of, dasyprocta, zero,\n",
"nearest to on: in, for, and, entirety, circ, with, from, of,\n",
"nearest to use: agouti, rectangle, reginae, arch, peak, fermilab, mats, vocal,\n",
"Average loss at step 22000 : 7.20771025956\n",
"Average loss at step 24000 : 6.90459180343\n",
"Average loss at step 26000 : 6.6249013586\n",
"Average loss at step 28000 : 6.17626126552\n",
"Average loss at step 30000 : 6.15420121288\n",
"nearest to states: reflect, reportedly, transmissions, stalks, topological, expanded, agouti, brazilian,\n",
"nearest to history: arbitrator, reginae, aristotle, woz, trinomial, dasyprocta, incline, hereby,\n",
"nearest to see: dasyprocta, sure, agouti, louise, abitibi, ac, six, four,\n",
"nearest to and: or, in, akh, of, victoriae, agouti, s, primigenius,\n",
"nearest to there: it, psychologically, he, which, wove, otimes, they, who,\n",
"nearest to if: when, agouti, isu, prevention, joan, bromide, had, berman,\n",
"nearest to united: managing, of, johanna, congestive, advance, mahogany, famine, akm,\n",
"nearest to may: can, would, incompatible, rarest, fond, nine, astrologers, procedures,\n",
"nearest to it: he, this, which, torment, also, there, they, annales,\n",
"nearest to as: by, and, for, dasyprocta, agouti, is, in, homomorphism,\n",
"nearest to has: had, have, is, was, lough, by, plum, are,\n",
"nearest to no: a, completely, and, venerable, commemoration, hijacker, agave, trinomial,\n",
"nearest to with: from, in, and, by, for, agouti, between, under,\n",
"nearest to s: and, or, his, of, zero, abet, four, trinomial,\n",
"nearest to on: in, for, and, from, at, with, two, circ,\n",
"nearest to use: rectangle, agouti, arch, reginae, fermilab, vocal, peak, vapor,\n",
"Average loss at step 32000 : 5.93763124335\n",
"Average loss at step 34000 : 5.88674430609\n",
"Average loss at step 36000 : 5.68349878764\n",
"Average loss at step 38000 : 5.29927867699\n",
"Average loss at step 40000 : 5.46713113594\n",
"nearest to states: reflect, stalks, reportedly, transmissions, topological, expanded, howlin, brazilian,\n",
"nearest to history: arbitrator, reginae, aristotle, dasyprocta, trinomial, woz, agouti, viridian,\n",
"nearest to see: goo, dasyprocta, sure, and, six, cc, louise, barbed,\n",
"nearest to and: or, six, UNK, four, dasyprocta, victoriae, primigenius, agouti,\n",
"nearest to there: it, which, he, they, psychologically, and, often, still,\n",
"nearest to if: when, agouti, isu, prevention, joan, berman, kuti, satan,\n",
"nearest to united: of, managing, johanna, congestive, mahogany, advance, akm, scipio,\n",
"nearest to may: can, would, incompatible, rarest, astrologers, procedures, fond, to,\n",
"nearest to it: he, this, which, there, they, torment, that, also,\n",
"nearest to as: by, dasyprocta, agouti, and, in, for, is, cajun,\n",
"nearest to has: had, have, is, was, lough, plum, auchinleck, by,\n",
"nearest to no: a, completely, venerable, and, hijacker, usually, agave, commemoration,\n",
"nearest to with: from, and, in, between, agouti, by, under, albury,\n",
"nearest to s: and, his, or, zero, trinomial, reginae, jarman, abet,\n",
"nearest to on: in, at, for, from, with, circ, dasyprocta, and,\n",
"nearest to use: agouti, reginae, rectangle, argo, albury, vapor, vocal, peak,\n",
"Average loss at step 42000 : 5.29190438855\n",
"Average loss at step 44000 : 5.33473292148\n",
"Average loss at step 46000 : 5.26311981332\n",
"Average loss at step 48000 : 5.02547687709\n",
"Average loss at step 50000 : 5.1276553297\n",
"nearest to states: reflect, reportedly, stalks, transmissions, topological, prism, expanded, howlin,\n",
"nearest to history: arbitrator, reginae, trinomial, aristotle, woz, dasyprocta, uncertain, hereby,\n",
"nearest to see: goo, dasyprocta, sure, barbed, prototype, and, lore, dimethyl,\n",
"nearest to and: or, but, agouti, akh, victoriae, abitibi, imperative, dasyprocta,\n",
"nearest to there: it, they, he, which, now, psychologically, still, often,\n",
"nearest to if: when, isu, agouti, prevention, joan, that, kuti, satan,\n",
"nearest to united: managing, johanna, of, mahogany, congestive, akm, scipio, advance,\n",
"nearest to may: can, would, incompatible, could, rarest, astrologers, procedures, will,\n",
"nearest to it: he, this, there, which, they, torment, also, not,\n",
"nearest to as: agouti, is, by, dasyprocta, prism, cajun, homomorphism, circ,\n",
"nearest to has: had, have, was, is, lough, auchinleck, plum, reginae,\n",
"nearest to no: a, completely, venerable, hijacker, agave, trinomial, usually, pascal,\n",
"nearest to with: and, from, by, in, between, agouti, for, albury,\n",
"nearest to s: and, his, trinomial, or, of, zero, jarman, five,\n",
"nearest to on: in, at, for, circ, from, nguni, dasyprocta, two,\n",
"nearest to use: agouti, reginae, argo, albury, rectangle, story, nasser, victoriae,\n",
"Average loss at step 52000 : 5.17703344357\n",
"Average loss at step 54000 : 5.11836372924\n",
"Average loss at step 56000 : 5.07739333141\n",
"Average loss at step 58000 : 5.12823755598\n",
"Average loss at step 60000 : 4.95967468631\n",
"nearest to states: reflect, reportedly, transmissions, stalks, topological, tamarin, cebus, prism,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, agouti, victoriae, trinomial, pulau,\n",
"nearest to see: goo, dasyprocta, but, can, barbed, sure, four, cardboard,\n",
"nearest to and: or, tamarin, cebus, but, victoriae, akh, microsite, microcebus,\n",
"nearest to there: it, they, he, which, now, still, often, psychologically,\n",
"nearest to if: when, isu, agouti, prevention, then, joan, that, kuti,\n",
"nearest to united: of, johanna, managing, cebus, mahogany, akm, congestive, famine,\n",
"nearest to may: can, would, could, will, incompatible, cannot, must, procedures,\n",
"nearest to it: he, this, there, which, they, tamarin, torment, she,\n",
"nearest to as: tamarin, capuchin, agouti, dasyprocta, marmoset, in, prism, by,\n",
"nearest to has: had, have, was, is, lough, auchinleck, dcsd, reginae,\n",
"nearest to no: a, completely, venerable, trinomial, agave, hijacker, tamarin, victoriae,\n",
"nearest to with: between, in, and, from, by, under, agouti, albury,\n",
"nearest to s: callithrix, zero, his, and, microcebus, trinomial, reginae, abet,\n",
"nearest to on: in, at, cebus, for, tamarin, circ, dasyprocta, iota,\n",
"nearest to use: tamarin, agouti, argo, reginae, cebus, albury, rectangle, microsite,\n",
"Average loss at step 62000 : 4.79626031196\n",
"Average loss at step 64000 : 4.80886316788\n",
"Average loss at step 66000 : 4.97172088683\n",
"Average loss at step 68000 : 4.92869670045\n",
"Average loss at step 70000 : 4.76773341072\n",
"nearest to states: reflect, reportedly, tamarin, stalks, transmissions, topological, cebus, prism,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, uncertain, hagbard, agouti, arbitrator,\n",
"nearest to see: goo, dasyprocta, but, can, sure, barbed, prototype, jellicoe,\n",
"nearest to and: or, but, cebus, tamarin, victoriae, callithrix, microcebus, abitibi,\n",
"nearest to there: it, they, which, still, now, he, often, psychologically,\n",
"nearest to if: when, agouti, isu, then, prevention, joan, however, though,\n",
"nearest to united: of, johanna, managing, cebus, famine, akm, mahogany, congestive,\n",
"nearest to may: can, would, could, will, must, incompatible, cannot, should,\n",
"nearest to it: he, this, there, which, they, she, torment, tamarin,\n",
"nearest to as: tamarin, agouti, capuchin, dasyprocta, prism, in, homomorphism, is,\n",
"nearest to has: had, have, was, is, lough, reginae, auchinleck, by,\n",
"nearest to no: completely, a, venerable, hijacker, trinomial, owing, commemoration, agave,\n",
"nearest to with: between, in, from, and, isu, agouti, albury, under,\n",
"nearest to s: callithrix, microcebus, zero, and, reginae, jarman, his, trinomial,\n",
"nearest to on: in, at, cebus, through, for, from, circ, dasyprocta,\n",
"nearest to use: tamarin, agouti, albury, rectangle, reginae, microsite, argo, callithrix,\n",
"Average loss at step 72000 : 4.79812862515\n",
"Average loss at step 74000 : 4.77556425592\n",
"Average loss at step 76000 : 4.86499132544\n",
"Average loss at step 78000 : 4.80580271888\n",
"Average loss at step 80000 : 4.81581209326\n",
"nearest to states: reflect, transmissions, reportedly, tamarin, stalks, topological, cebus, howlin,\n",
"nearest to history: callithrix, reginae, dasyprocta, uncertain, hagbard, tamarin, arbitrator, horrors,\n",
"nearest to see: goo, but, dasyprocta, barbed, boudin, sure, prototype, anatomical,\n",
"nearest to and: or, tamarin, but, cebus, microcebus, victoriae, callithrix, eight,\n",
"nearest to there: it, they, he, now, still, which, often, instances,\n",
"nearest to if: when, then, agouti, isu, however, though, dist, joan,\n",
"nearest to united: johanna, managing, scipio, famine, akm, mahogany, congestive, of,\n",
"nearest to may: can, would, could, will, must, should, cannot, might,\n",
"nearest to it: he, this, there, which, they, she, tamarin, torment,\n",
"nearest to as: tamarin, capuchin, agouti, prism, marmoset, dasyprocta, by, claudel,\n",
"nearest to has: had, have, was, is, lough, auchinleck, bomarc, dickson,\n",
"nearest to no: completely, venerable, a, hijacker, trinomial, agave, any, microsite,\n",
"nearest to with: between, in, from, agouti, isu, and, albury, by,\n",
"nearest to s: zero, callithrix, his, abet, microcebus, five, trinomial, microsite,\n",
"nearest to on: in, at, through, cebus, from, for, two, circ,\n",
"nearest to use: tamarin, agouti, albury, reginae, rectangle, microsite, argo, callithrix,\n",
"Average loss at step 82000 : 4.80138861847\n",
"Average loss at step 84000 : 4.77657458603\n",
"Average loss at step 86000 : 4.74599059129\n",
"Average loss at step 88000 : 4.68782997322\n",
"Average loss at step 90000 : 4.75792138851\n",
"nearest to states: reflect, reportedly, transmissions, tamarin, topological, cebus, stalks, howlin,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, arbitrator, hagbard, agouti, horrors,\n",
"nearest to see: goo, but, dasyprocta, anatomical, barbed, boudin, can, five,\n",
"nearest to and: or, but, tamarin, cebus, microcebus, abitibi, while, cegep,\n",
"nearest to there: they, it, he, still, now, often, which, but,\n",
"nearest to if: when, then, agouti, though, where, isu, however, is,\n",
"nearest to united: johanna, of, managing, scipio, mahogany, akm, congestive, famine,\n",
"nearest to may: can, would, could, will, should, must, cannot, might,\n",
"nearest to it: he, this, there, she, they, which, tamarin, torment,\n",
"nearest to as: tamarin, capuchin, agouti, dasyprocta, or, cegep, prism, when,\n",
"nearest to has: had, have, is, was, lough, dickson, but, since,\n",
"nearest to no: completely, a, venerable, any, hijacker, trinomial, owing, commemoration,\n",
"nearest to with: between, in, from, and, isu, agouti, during, ansgar,\n",
"nearest to s: his, callithrix, microcebus, and, abet, references, trinomial, zero,\n",
"nearest to on: in, at, cebus, through, upon, for, tamarin, from,\n",
"nearest to use: tamarin, agouti, argo, reginae, microsite, albury, callithrix, rectangle,\n",
"Average loss at step 92000 : 4.72107960212\n",
"Average loss at step 94000 : 4.62920029783\n",
"Average loss at step 96000 : 4.72291916287\n",
"Average loss at step 98000 : 4.6251190145\n",
"Average loss at step 100000 : 4.68731605053\n",
"nearest to states: reflect, transmissions, reportedly, topological, tamarin, howlin, cebus, territory,\n",
"nearest to history: callithrix, reginae, dasyprocta, arbitrator, uncertain, tamarin, viridian, hagbard,\n",
"nearest to see: goo, but, weightings, anatomical, dasyprocta, can, digamma, and,\n",
"nearest to and: or, but, tamarin, cebus, while, microcebus, however, callithrix,\n",
"nearest to there: they, it, he, now, still, often, which, instances,\n",
"nearest to if: when, though, where, then, agouti, isu, however, while,\n",
"nearest to united: johanna, managing, scipio, of, mahogany, akm, cebus, congestive,\n",
"nearest to may: can, would, could, will, should, must, might, cannot,\n",
"nearest to it: he, this, there, she, they, which, tamarin, torment,\n",
"nearest to as: tamarin, agouti, capuchin, dasyprocta, marmoset, prism, cegep, victoriae,\n",
"nearest to has: had, have, was, is, lough, dickson, cegep, monophysitism,\n",
"nearest to no: any, completely, a, venerable, hijacker, trinomial, only, agave,\n",
"nearest to with: between, in, from, isu, during, agouti, when, ansgar,\n",
"nearest to s: his, callithrix, abet, trinomial, five, microcebus, reginae, and,\n",
"nearest to on: in, at, through, upon, cebus, for, dasyprocta, roshan,\n",
"nearest to use: tamarin, agouti, reginae, argo, albury, most, microsite, callithrix,\n"
]
}
],
"source": [
"# steps to train the model\n",
"num_steps = 100001\n",
"config = tf.ConfigProto(allow_soft_placement=True)\n",
"config.gpu_options.allow_growth = True\n",
"\n",
"with tf.Session(graph=graph, config=config) as sess:\n",
"    # we must initialize all variables before using them\n",
"    # init.run()\n",
"    sess.run(init)\n",
"    # tf.global_variables_initializer().run()\n",
"    print('initialized.')\n",
"\n",
"    # loop through all training steps and keep track of loss\n",
"    average_loss = 0\n",
"    for step in range(num_steps):\n",
"        # generate a minibatch of training data\n",
"        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)\n",
"        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}\n",
"\n",
"        # we perform a single update step by evaluating the optimizer operation (including it\n",
"        # in the list of returned values of sess.run())\n",
"        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)\n",
"        average_loss += loss_val\n",
"\n",
"        # print the average loss every 2,000 steps\n",
"        if step % 2000 == 0:\n",
"            if step > 0:\n",
"                average_loss /= 2000\n",
"            # the average loss is an estimate of the loss over the last 2000 batches.\n",
"            print(\"Average loss at step \", step, \": \", average_loss)\n",
"            average_loss = 0\n",
"\n",
"        # compute cosine similarity every 10,000 steps (expensive!)\n",
"        if step % 10000 == 0:\n",
"            sim = similarity.eval()\n",
"            for i in range(valid_size):\n",
"                # get a single validation sample\n",
"                valid_word = reverse_dictionary[valid_examples[i]]\n",
"                # number of nearest neighbors\n",
"                top_k = 8\n",
"                # computing nearest neighbors\n",
"                nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n",
"                log_str = \"nearest to %s:\" % valid_word\n",
"                for k in range(top_k):\n",
"                    close_word = reverse_dictionary[nearest[k]]\n",
"                    log_str = \"%s %s,\" % (log_str, close_word)\n",
"                print(log_str)\n",
"\n",
"    final_embeddings = normalized_embeddings.eval()"
]
},
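{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once training has produced `final_embeddings`, nearest neighbors can be looked up for any vocabulary word outside the training loop. A small sketch (an addition; the query word 'history' is only an example and the helper name is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: query nearest neighbors from the trained, row-normalized embeddings\n",
"def nearest_words(word, top_k=8):\n",
"    idx = dictionary.get(word, 0)  # unknown words fall back to 'UNK'\n",
"    # rows of final_embeddings are unit length, so a dot product is the cosine similarity\n",
"    sims = np.dot(final_embeddings, final_embeddings[idx])\n",
"    nearest = (-sims).argsort()[1:top_k + 1]  # skip the query word itself\n",
"    return [reverse_dictionary[i] for i in nearest]\n",
"\n",
"print('history ->', nearest_words('history'))"
]
},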
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3(tf)",
"language": "python",
"name": "python3_tf"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}