Basic Word2Vec Sample Code
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- https://github.com/tensorflow/tensorflow/blob/r1.3/tensorflow/examples/tutorials/word2vec/word2vec_basic.py\n",
"- https://djsaunde.github.io/word2vec.html\n",
"- http://khanrc.tistory.com/entry/TensorFlow-7-word2vec-Implementation\n",
"- http://solarisailab.com/archives/374\n",
"- https://github.com/danielfrg/word2vec/blob/master/examples/word2vec.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import zipfile\n",
"import urllib.request  # needed by download() below\n",
"import tensorflow as tf\n",
"\n",
"import collections\n",
"import numpy as np\n",
"import random\n",
"\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found and verified text8.zip\n",
"data size: 17005207\n"
]
}
],
"source": [
"# download the data\n",
"url = 'http://mattmahoney.net/dc/'\n",
"\n",
"def download(filename, expected_bytes):\n",
"    \"\"\"\n",
"    Download a file if not present, and make sure it's the right size.\n",
"    \"\"\"\n",
"    if not os.path.exists(filename):\n",
"        filename, _ = urllib.request.urlretrieve(url + filename, filename)\n",
"    statinfo = os.stat(filename)\n",
"    if statinfo.st_size == expected_bytes:\n",
"        print('Found and verified', filename)\n",
"    else:\n",
"        print(statinfo.st_size)\n",
"        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')\n",
"    return filename\n",
"\n",
"# download the file\n",
"file_ = download('text8.zip', 31344016)\n",
"\n",
"def read_data(filename):\n",
"    \"\"\"\n",
"    Parse the file contained in the zip archive 'filename' into a list of words.\n",
"    \"\"\"\n",
"    # unzip the file\n",
"    with zipfile.ZipFile(filename) as f:\n",
"        # read the data into the 'data' variable\n",
"        data = tf.compat.as_str(f.read(f.namelist()[0])).split()\n",
"    # return the data\n",
"    return data\n",
"\n",
"words = read_data(file_)\n",
"print('data size:', len(words))"
]
},
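{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check, added here as a sketch: `read_data` should return a flat list of lowercase tokens, so peeking at the first few entries confirms the corpus parsed correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# peek at the first few parsed tokens; for text8 these are the words\n",
"# 'anarchism originated as a term of ...' (cf. the sample data below)\n",
"print(words[:8])"
]
},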
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"most common words (+UNK): [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]\n",
"sample data: [5239, 3081, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']\n"
]
}
],
"source": [
"# build the dictionary and replace rare words with the \"UNK\" token.\n",
"vocabulary_size = 50000\n",
"\n",
"def build_dataset(words):\n",
"    # create the counts list; set the count for the \"UNK\" token to -1 (filled in below)\n",
"    count = [['UNK', -1]]\n",
"    # add counts of the 49,999 most common tokens in 'words'\n",
"    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))\n",
"    # create the dictionary data structure\n",
"    dictionary = {}\n",
"    # give a unique integer ID to each token in the dictionary\n",
"    for word, _ in count:\n",
"        dictionary[word] = len(dictionary)\n",
"    # create a list data structure for the data\n",
"    data = []\n",
"    # keep track of the number of \"UNK\" token occurrences\n",
"    unk_count = 0\n",
"    # for each word in our list of words\n",
"    for word in words:\n",
"        # if it's in the dictionary, get its index\n",
"        if word in dictionary:\n",
"            index = dictionary[word]\n",
"        # otherwise, set the index to zero (the index of \"UNK\") and increment the \"UNK\" count\n",
"        else:\n",
"            index = 0  # dictionary['UNK']\n",
"            unk_count += 1\n",
"        # append its index to the 'data' list structure\n",
"        data.append(index)\n",
"    # set the count of \"UNK\" in the 'count' data structure\n",
"    count[0][1] = unk_count\n",
"    # invert the dictionary; it becomes (index, word) key-value pairs\n",
"    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
"    # return the data (indices), counts, dictionary, and inverted dictionary\n",
"    return data, count, dictionary, reverse_dictionary\n",
"\n",
"# build the dataset\n",
"data, count, dictionary, reverse_dictionary = build_dataset(words)\n",
"# free up some memory\n",
"del words\n",
"# print out stats\n",
"print('most common words (+UNK):', count[:10])\n",
"print('sample data:', data[:10], [reverse_dictionary[i] for i in data[:10]])"
]
},
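{
"cell_type": "markdown",
"metadata": {},
"source": [
"`dictionary` and `reverse_dictionary` form a bijection between tokens and integer IDs. A minimal round-trip check, added here as a sketch (per the sample data above, 'anarchism' has ID 5239):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# map a token to its ID and back; should print \"5239 anarchism\"\n",
"idx = dictionary['anarchism']\n",
"print(idx, reverse_dictionary[idx])"
]
},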
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"6700074"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_index"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3081 originated -> 12 as\n",
"3081 originated -> 5239 anarchism\n",
"12 as -> 6 a\n",
"12 as -> 3081 originated\n",
"6 a -> 12 as\n",
"6 a -> 195 term\n",
"195 term -> 2 of\n",
"195 term -> 6 a\n"
]
}
],
"source": [
"data_index = 0\n",
"\n",
"# generate a training batch for the skip-gram model\n",
"def generate_batch(batch_size, num_skips, skip_window):\n",
"    global data_index\n",
"    # make sure our parameters are self-consistent\n",
"    assert batch_size % num_skips == 0\n",
"    assert num_skips <= 2 * skip_window\n",
"    # create an empty batch ndarray of size 'batch_size'\n",
"    batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
"    # create an empty labels ndarray of size 'batch_size'\n",
"    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
"    # [ skip_window target skip_window ]\n",
"    span = 2 * skip_window + 1\n",
"    # create a buffer object for prepping batch data\n",
"    buffer = collections.deque(maxlen=span)\n",
"    # fill the buffer with 'span' elements, advancing 'data_index' modulo the data length\n",
"    for _ in range(span):\n",
"        buffer.append(data[data_index])\n",
"        data_index = (data_index + 1) % len(data)\n",
"    # loop for 'batch_size' // 'num_skips'\n",
"    for i in range(batch_size // num_skips):\n",
"        # target label at the center of the buffer\n",
"        target = skip_window\n",
"        targets_to_avoid = [skip_window]\n",
"        # loop for 'num_skips'\n",
"        for j in range(num_skips):\n",
"            # resample until we pick a context position not yet used\n",
"            while target in targets_to_avoid:\n",
"                # pick a random index as target\n",
"                target = random.randint(0, span - 1)\n",
"            # put it in 'targets_to_avoid'\n",
"            targets_to_avoid.append(target)\n",
"            # set the center word in the minibatch data\n",
"            batch[i * num_skips + j] = buffer[skip_window]\n",
"            # set the target in the minibatch labels\n",
"            labels[i * num_skips + j, 0] = buffer[target]\n",
"        # add the data at the current 'data_index' to the buffer\n",
"        buffer.append(data[data_index])\n",
"        # increment 'data_index'\n",
"        data_index = (data_index + 1) % len(data)\n",
"    # return the minibatch data and corresponding labels\n",
"    return batch, labels\n",
"\n",
"# get a minibatch\n",
"batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)\n",
"\n",
"# print out part of the minibatch to the console\n",
"for i in range(8):\n",
"    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])"
]
},
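{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `skip_window=1` and `num_skips=2`, the span is 3 tokens and every center word emits two (center, context) pairs, which is why each input word appears twice in the printed batch. As a sketch of the standard skip-gram objective (not code from this notebook), for a corpus $w_1, \\dots, w_T$ the model maximizes\n",
"\n",
"$$\\frac{1}{T} \\sum_{t=1}^{T} \\sum_{-c \\leq j \\leq c,\\, j \\neq 0} \\log p(w_{t+j} \\mid w_t),$$\n",
"\n",
"where $c$ is `skip_window`; `generate_batch` enumerates exactly the $(w_t, w_{t+j})$ pairs that this sum ranges over."
]
},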
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([3081, 3081,   12,   12,    6,    6,  195,  195], dtype=int32)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"batch"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[  12],\n",
"       [5239],\n",
"       [   6],\n",
"       [3081],\n",
"       [  12],\n",
"       [ 195],\n",
"       [   2],\n",
"       [   6]], dtype=int32)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# # Creates a graph.\n",
"# with tf.device('/gpu:0'):\n",
"#     a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')\n",
"#     b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')\n",
"#     c = tf.matmul(a, b)\n",
"# # Creates a session with log_device_placement set to True.\n",
"# config = tf.ConfigProto()\n",
"# config.gpu_options.allow_growth = True\n",
"# sess = tf.Session(config=config)\n",
"# # Runs the op.\n",
"# print(sess.run(c))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hyperparameters\n",
"batch_size = 128\n",
"embedding_size = 128  # dimension of the embedding vector\n",
"skip_window = 1  # how many words to consider to the left and right\n",
"num_skips = 2  # how many times to reuse an input to generate a label\n",
"\n",
"# we choose a random validation set to sample nearest neighbors;\n",
"# here, we limit the validation samples to words with a low numeric ID,\n",
"# which are also the most frequently occurring words\n",
"valid_size = 16  # size of the random set of words to evaluate similarity on\n",
"valid_window = 100  # only pick validation samples from the first 'valid_window' words\n",
"valid_examples = np.random.choice(valid_window, valid_size, replace=False)\n",
"num_sampled = 64  # number of negative examples to sample\n",
"\n",
"# create the computation graph\n",
"graph = tf.Graph()\n",
"\n",
"with graph.as_default(), tf.device('/gpu:0'):\n",
"# with graph.device('/device:CPU:0'):\n",
"    # input data\n",
"    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])\n",
"    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
"    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
"\n",
"    # operations and variables\n",
"    # look up embeddings for inputs\n",
"    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
"    embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n",
"\n",
"    # construct the variables for the NCE loss\n",
"    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))\n",
"    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
"\n",
"    # compute the average NCE loss for the batch;\n",
"    # tf.nn.nce_loss automatically draws a new sample of the negative labels each time we evaluate the loss\n",
"    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,\n",
"        labels=train_labels, inputs=embed, num_sampled=num_sampled, num_classes=vocabulary_size))\n",
"\n",
"    # construct the SGD optimizer using a learning rate of 1.0\n",
"    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)\n",
"\n",
"    # compute the cosine similarity between minibatch examples and all embeddings\n",
"    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
"    normalized_embeddings = embeddings / norm\n",
"    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)\n",
"    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)\n",
"\n",
"    # add the variable initializer\n",
"    init = tf.global_variables_initializer()"
]
},
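{
"cell_type": "markdown",
"metadata": {},
"source": [
"Rather than a full 50,000-way softmax, `tf.nn.nce_loss` trains a logistic classifier to tell the true context word apart from `num_sampled` words drawn from a noise distribution. As a rough sketch (this is the closely related negative-sampling form; NCE proper also corrects for the noise probabilities), the per-example loss is\n",
"\n",
"$$-\\log \\sigma\\left(u_o^{\\top} v_c + b_o\\right) - \\sum_{k=1}^{K} \\log \\sigma\\left(-(u_k^{\\top} v_c + b_k)\\right),$$\n",
"\n",
"where $v_c$ is the looked-up input embedding (`embed`), $u$ and $b$ are rows of `nce_weights` and entries of `nce_biases`, $o$ indexes the true label, and $k$ ranges over the $K = 64$ sampled negatives."
]
},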
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"initialized.\n",
"Average loss at step 0 : 267.861968994\n",
"nearest to states: stalks, reflect, reportedly, equated, transmissions, mawr, dj, uncomplicated,\n",
"nearest to history: consistory, hereby, arbitrator, rushing, incline, irian, woz, bureaucrats,\n",
"nearest to see: insignificance, setups, unfettered, sure, hernando, technische, aromatic, boudin,\n",
"nearest to and: manitou, japh, hatfield, deathbed, gime, i, lackluster, configurations,\n",
"nearest to there: wove, achieve, otimes, psychologically, grammys, saxe, vinge, needle,\n",
"nearest to if: cinematographers, bromide, plane, declined, flatus, assyrians, coating, sedimentary,\n",
"nearest to united: heinemann, managing, oyly, bastille, bums, scipio, conceivably, carnarvon,\n",
"nearest to may: trump, wah, nanosystems, nuh, fond, phoca, hoplites, pseudo,\n",
"nearest to it: picts, regius, despot, maronites, buckingham, superintendent, synod, versed,\n",
"nearest to as: freestyle, botched, sigma, claudel, birkhoff, vec, rostock, iter,\n",
"nearest to has: joliot, lough, affiliations, montego, bertolt, belongs, gerrard, widgery,\n",
"nearest to no: stosunku, dq, celery, intermarriages, paralleled, completely, gillette, dramatized,\n",
"nearest to with: bed, backwards, digraphs, cvs, indie, generality, reflected, faber,\n",
"nearest to s: soured, fees, debbie, chaldea, cosmology, preseason, malleable, ebook,\n",
"nearest to on: postumus, entirety, lebanese, keeshond, longbow, hanoi, proofs, maintains,\n",
"nearest to use: wholesome, lowercase, rectangle, strangely, covert, situational, goidelic, vocal,\n",
"Average loss at step 2000 : 113.505586137\n",
"Average loss at step 4000 : 52.5892334256\n",
"Average loss at step 6000 : 33.4853440809\n",
"Average loss at step 8000 : 23.7135089986\n",
"Average loss at step 10000 : 18.1392678461\n",
"nearest to states: reflect, reportedly, transmissions, phi, expanded, topological, marked, dj,\n",
"nearest to history: reginae, aristotle, private, rushing, that, victoriae, austin, beast,\n",
"nearest to see: sure, louise, ac, cc, finalist, explains, prison, it,\n",
"nearest to and: in, of, UNK, victoriae, or, for, one, s,\n",
"nearest to there: achieve, display, loyalists, october, fifty, formula, psychologically, persephone,\n",
"nearest to if: altenberg, cl, gold, declined, reginae, agave, coating, joan,\n",
"nearest to united: advance, johanna, managing, gollancz, gb, includes, book, reginae,\n",
"nearest to may: rarest, incompatible, tissue, darius, nine, syllable, album, can,\n",
"nearest to it: he, consumer, samurai, this, interest, torment, also, novels,\n",
"nearest to as: for, in, homomorphism, sigma, is, and, member, cc,\n",
"nearest to has: have, reginae, had, experienced, singer, feature, plum, nicotine,\n",
"nearest to no: completely, and, agave, victoriae, sports, commemoration, analogue, austin,\n",
"nearest to with: in, and, from, baseball, by, bed, on, for,\n",
"nearest to s: and, libels, reginae, or, everything, victoriae, transform, compounds,\n",
"nearest to on: in, for, and, of, entirety, austin, escorial, with,\n",
"nearest to use: reginae, arch, cl, peak, pacifist, azerbaijani, lowercase, victoriae,\n",
"Average loss at step 12000 : 13.9590847435\n",
"Average loss at step 14000 : 11.7571933647\n",
"Average loss at step 16000 : 9.90769602233\n",
"Average loss at step 18000 : 8.59951363635\n",
"Average loss at step 20000 : 7.80482456183\n",
"nearest to states: reflect, reportedly, stalks, transmissions, topological, agouti, expanded, flee,\n",
"nearest to history: aristotle, reginae, dasyprocta, that, incline, private, the, rushing,\n",
"nearest to see: dasyprocta, sure, ac, six, louise, agouti, operatorname, and,\n",
"nearest to and: or, in, agouti, of, for, UNK, victoriae, with,\n",
"nearest to there: wove, psychologically, it, achieve, loyalists, which, fifty, circ,\n",
"nearest to if: agouti, isu, bromide, joan, prevention, when, berman, altenberg,\n",
"nearest to united: managing, johanna, advance, gollancz, gb, famine, akm, book,\n",
"nearest to may: can, rarest, fond, incompatible, agouti, astrologers, procedures, administering,\n",
"nearest to it: he, this, torment, also, annales, which, versed, consumer,\n",
"nearest to as: and, for, in, is, by, agouti, dasyprocta, was,\n",
"nearest to has: had, have, was, is, lough, are, plum, reginae,\n",
"nearest to no: completely, and, commemoration, venerable, pascal, agave, victoriae, a,\n",
"nearest to with: in, and, from, for, by, agouti, of, on,\n",
"nearest to s: and, or, the, his, reginae, of, dasyprocta, zero,\n",
"nearest to on: in, for, and, entirety, circ, with, from, of,\n",
"nearest to use: agouti, rectangle, reginae, arch, peak, fermilab, mats, vocal,\n",
"Average loss at step 22000 : 7.20771025956\n",
"Average loss at step 24000 : 6.90459180343\n",
"Average loss at step 26000 : 6.6249013586\n",
"Average loss at step 28000 : 6.17626126552\n",
"Average loss at step 30000 : 6.15420121288\n",
"nearest to states: reflect, reportedly, transmissions, stalks, topological, expanded, agouti, brazilian,\n",
"nearest to history: arbitrator, reginae, aristotle, woz, trinomial, dasyprocta, incline, hereby,\n",
"nearest to see: dasyprocta, sure, agouti, louise, abitibi, ac, six, four,\n",
"nearest to and: or, in, akh, of, victoriae, agouti, s, primigenius,\n",
"nearest to there: it, psychologically, he, which, wove, otimes, they, who,\n",
"nearest to if: when, agouti, isu, prevention, joan, bromide, had, berman,\n",
"nearest to united: managing, of, johanna, congestive, advance, mahogany, famine, akm,\n",
"nearest to may: can, would, incompatible, rarest, fond, nine, astrologers, procedures,\n",
"nearest to it: he, this, which, torment, also, there, they, annales,\n",
"nearest to as: by, and, for, dasyprocta, agouti, is, in, homomorphism,\n",
"nearest to has: had, have, is, was, lough, by, plum, are,\n",
"nearest to no: a, completely, and, venerable, commemoration, hijacker, agave, trinomial,\n",
"nearest to with: from, in, and, by, for, agouti, between, under,\n",
"nearest to s: and, or, his, of, zero, abet, four, trinomial,\n",
"nearest to on: in, for, and, from, at, with, two, circ,\n",
"nearest to use: rectangle, agouti, arch, reginae, fermilab, vocal, peak, vapor,\n",
"Average loss at step 32000 : 5.93763124335\n",
"Average loss at step 34000 : 5.88674430609\n",
"Average loss at step 36000 : 5.68349878764\n",
"Average loss at step 38000 : 5.29927867699\n",
"Average loss at step 40000 : 5.46713113594\n",
"nearest to states: reflect, stalks, reportedly, transmissions, topological, expanded, howlin, brazilian,\n",
"nearest to history: arbitrator, reginae, aristotle, dasyprocta, trinomial, woz, agouti, viridian,\n",
"nearest to see: goo, dasyprocta, sure, and, six, cc, louise, barbed,\n",
"nearest to and: or, six, UNK, four, dasyprocta, victoriae, primigenius, agouti,\n",
"nearest to there: it, which, he, they, psychologically, and, often, still,\n",
"nearest to if: when, agouti, isu, prevention, joan, berman, kuti, satan,\n",
"nearest to united: of, managing, johanna, congestive, mahogany, advance, akm, scipio,\n",
"nearest to may: can, would, incompatible, rarest, astrologers, procedures, fond, to,\n",
"nearest to it: he, this, which, there, they, torment, that, also,\n",
"nearest to as: by, dasyprocta, agouti, and, in, for, is, cajun,\n",
"nearest to has: had, have, is, was, lough, plum, auchinleck, by,\n",
"nearest to no: a, completely, venerable, and, hijacker, usually, agave, commemoration,\n",
"nearest to with: from, and, in, between, agouti, by, under, albury,\n",
"nearest to s: and, his, or, zero, trinomial, reginae, jarman, abet,\n",
"nearest to on: in, at, for, from, with, circ, dasyprocta, and,\n",
"nearest to use: agouti, reginae, rectangle, argo, albury, vapor, vocal, peak,\n",
"Average loss at step 42000 : 5.29190438855\n",
"Average loss at step 44000 : 5.33473292148\n",
"Average loss at step 46000 : 5.26311981332\n",
"Average loss at step 48000 : 5.02547687709\n",
"Average loss at step 50000 : 5.1276553297\n",
"nearest to states: reflect, reportedly, stalks, transmissions, topological, prism, expanded, howlin,\n",
"nearest to history: arbitrator, reginae, trinomial, aristotle, woz, dasyprocta, uncertain, hereby,\n",
"nearest to see: goo, dasyprocta, sure, barbed, prototype, and, lore, dimethyl,\n",
"nearest to and: or, but, agouti, akh, victoriae, abitibi, imperative, dasyprocta,\n",
"nearest to there: it, they, he, which, now, psychologically, still, often,\n",
"nearest to if: when, isu, agouti, prevention, joan, that, kuti, satan,\n",
"nearest to united: managing, johanna, of, mahogany, congestive, akm, scipio, advance,\n",
"nearest to may: can, would, incompatible, could, rarest, astrologers, procedures, will,\n",
"nearest to it: he, this, there, which, they, torment, also, not,\n",
"nearest to as: agouti, is, by, dasyprocta, prism, cajun, homomorphism, circ,\n",
"nearest to has: had, have, was, is, lough, auchinleck, plum, reginae,\n",
"nearest to no: a, completely, venerable, hijacker, agave, trinomial, usually, pascal,\n",
"nearest to with: and, from, by, in, between, agouti, for, albury,\n",
"nearest to s: and, his, trinomial, or, of, zero, jarman, five,\n",
"nearest to on: in, at, for, circ, from, nguni, dasyprocta, two,\n",
"nearest to use: agouti, reginae, argo, albury, rectangle, story, nasser, victoriae,\n",
"Average loss at step 52000 : 5.17703344357\n",
"Average loss at step 54000 : 5.11836372924\n",
"Average loss at step 56000 : 5.07739333141\n",
"Average loss at step 58000 : 5.12823755598\n",
"Average loss at step 60000 : 4.95967468631\n",
"nearest to states: reflect, reportedly, transmissions, stalks, topological, tamarin, cebus, prism,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, agouti, victoriae, trinomial, pulau,\n",
"nearest to see: goo, dasyprocta, but, can, barbed, sure, four, cardboard,\n",
"nearest to and: or, tamarin, cebus, but, victoriae, akh, microsite, microcebus,\n",
"nearest to there: it, they, he, which, now, still, often, psychologically,\n",
"nearest to if: when, isu, agouti, prevention, then, joan, that, kuti,\n",
"nearest to united: of, johanna, managing, cebus, mahogany, akm, congestive, famine,\n",
"nearest to may: can, would, could, will, incompatible, cannot, must, procedures,\n",
"nearest to it: he, this, there, which, they, tamarin, torment, she,\n",
"nearest to as: tamarin, capuchin, agouti, dasyprocta, marmoset, in, prism, by,\n",
"nearest to has: had, have, was, is, lough, auchinleck, dcsd, reginae,\n",
"nearest to no: a, completely, venerable, trinomial, agave, hijacker, tamarin, victoriae,\n",
"nearest to with: between, in, and, from, by, under, agouti, albury,\n",
"nearest to s: callithrix, zero, his, and, microcebus, trinomial, reginae, abet,\n",
"nearest to on: in, at, cebus, for, tamarin, circ, dasyprocta, iota,\n",
"nearest to use: tamarin, agouti, argo, reginae, cebus, albury, rectangle, microsite,\n",
"Average loss at step 62000 : 4.79626031196\n",
"Average loss at step 64000 : 4.80886316788\n",
"Average loss at step 66000 : 4.97172088683\n",
"Average loss at step 68000 : 4.92869670045\n",
"Average loss at step 70000 : 4.76773341072\n",
"nearest to states: reflect, reportedly, tamarin, stalks, transmissions, topological, cebus, prism,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, uncertain, hagbard, agouti, arbitrator,\n",
"nearest to see: goo, dasyprocta, but, can, sure, barbed, prototype, jellicoe,\n",
"nearest to and: or, but, cebus, tamarin, victoriae, callithrix, microcebus, abitibi,\n",
"nearest to there: it, they, which, still, now, he, often, psychologically,\n",
"nearest to if: when, agouti, isu, then, prevention, joan, however, though,\n",
"nearest to united: of, johanna, managing, cebus, famine, akm, mahogany, congestive,\n",
"nearest to may: can, would, could, will, must, incompatible, cannot, should,\n",
"nearest to it: he, this, there, which, they, she, torment, tamarin,\n",
"nearest to as: tamarin, agouti, capuchin, dasyprocta, prism, in, homomorphism, is,\n",
"nearest to has: had, have, was, is, lough, reginae, auchinleck, by,\n",
"nearest to no: completely, a, venerable, hijacker, trinomial, owing, commemoration, agave,\n",
"nearest to with: between, in, from, and, isu, agouti, albury, under,\n",
"nearest to s: callithrix, microcebus, zero, and, reginae, jarman, his, trinomial,\n",
"nearest to on: in, at, cebus, through, for, from, circ, dasyprocta,\n",
"nearest to use: tamarin, agouti, albury, rectangle, reginae, microsite, argo, callithrix,\n",
"Average loss at step 72000 : 4.79812862515\n",
"Average loss at step 74000 : 4.77556425592\n",
"Average loss at step 76000 : 4.86499132544\n",
"Average loss at step 78000 : 4.80580271888\n",
"Average loss at step 80000 : 4.81581209326\n",
"nearest to states: reflect, transmissions, reportedly, tamarin, stalks, topological, cebus, howlin,\n",
"nearest to history: callithrix, reginae, dasyprocta, uncertain, hagbard, tamarin, arbitrator, horrors,\n",
"nearest to see: goo, but, dasyprocta, barbed, boudin, sure, prototype, anatomical,\n",
"nearest to and: or, tamarin, but, cebus, microcebus, victoriae, callithrix, eight,\n",
"nearest to there: it, they, he, now, still, which, often, instances,\n",
"nearest to if: when, then, agouti, isu, however, though, dist, joan,\n",
"nearest to united: johanna, managing, scipio, famine, akm, mahogany, congestive, of,\n",
"nearest to may: can, would, could, will, must, should, cannot, might,\n",
"nearest to it: he, this, there, which, they, she, tamarin, torment,\n",
"nearest to as: tamarin, capuchin, agouti, prism, marmoset, dasyprocta, by, claudel,\n",
"nearest to has: had, have, was, is, lough, auchinleck, bomarc, dickson,\n",
"nearest to no: completely, venerable, a, hijacker, trinomial, agave, any, microsite,\n",
"nearest to with: between, in, from, agouti, isu, and, albury, by,\n",
"nearest to s: zero, callithrix, his, abet, microcebus, five, trinomial, microsite,\n",
"nearest to on: in, at, through, cebus, from, for, two, circ,\n",
"nearest to use: tamarin, agouti, albury, reginae, rectangle, microsite, argo, callithrix,\n",
"Average loss at step 82000 : 4.80138861847\n",
"Average loss at step 84000 : 4.77657458603\n",
"Average loss at step 86000 : 4.74599059129\n",
"Average loss at step 88000 : 4.68782997322\n",
"Average loss at step 90000 : 4.75792138851\n",
"nearest to states: reflect, reportedly, transmissions, tamarin, topological, cebus, stalks, howlin,\n",
"nearest to history: callithrix, reginae, dasyprocta, tamarin, arbitrator, hagbard, agouti, horrors,\n",
"nearest to see: goo, but, dasyprocta, anatomical, barbed, boudin, can, five,\n",
"nearest to and: or, but, tamarin, cebus, microcebus, abitibi, while, cegep,\n",
"nearest to there: they, it, he, still, now, often, which, but,\n",
"nearest to if: when, then, agouti, though, where, isu, however, is,\n",
"nearest to united: johanna, of, managing, scipio, mahogany, akm, congestive, famine,\n",
"nearest to may: can, would, could, will, should, must, cannot, might,\n",
"nearest to it: he, this, there, she, they, which, tamarin, torment,\n",
"nearest to as: tamarin, capuchin, agouti, dasyprocta, or, cegep, prism, when,\n",
"nearest to has: had, have, is, was, lough, dickson, but, since,\n",
"nearest to no: completely, a, venerable, any, hijacker, trinomial, owing, commemoration,\n",
"nearest to with: between, in, from, and, isu, agouti, during, ansgar,\n",
"nearest to s: his, callithrix, microcebus, and, abet, references, trinomial, zero,\n",
"nearest to on: in, at, cebus, through, upon, for, tamarin, from,\n",
"nearest to use: tamarin, agouti, argo, reginae, microsite, albury, callithrix, rectangle,\n",
"Average loss at step 92000 : 4.72107960212\n",
"Average loss at step 94000 : 4.62920029783\n",
"Average loss at step 96000 : 4.72291916287\n",
"Average loss at step 98000 : 4.6251190145\n",
"Average loss at step 100000 : 4.68731605053\n",
"nearest to states: reflect, transmissions, reportedly, topological, tamarin, howlin, cebus, territory,\n",
"nearest to history: callithrix, reginae, dasyprocta, arbitrator, uncertain, tamarin, viridian, hagbard,\n",
"nearest to see: goo, but, weightings, anatomical, dasyprocta, can, digamma, and,\n",
"nearest to and: or, but, tamarin, cebus, while, microcebus, however, callithrix,\n",
"nearest to there: they, it, he, now, still, often, which, instances,\n",
"nearest to if: when, though, where, then, agouti, isu, however, while,\n",
"nearest to united: johanna, managing, scipio, of, mahogany, akm, cebus, congestive,\n",
"nearest to may: can, would, could, will, should, must, might, cannot,\n",
"nearest to it: he, this, there, she, they, which, tamarin, torment,\n",
"nearest to as: tamarin, agouti, capuchin, dasyprocta, marmoset, prism, cegep, victoriae,\n",
"nearest to has: had, have, was, is, lough, dickson, cegep, monophysitism,\n",
"nearest to no: any, completely, a, venerable, hijacker, trinomial, only, agave,\n",
"nearest to with: between, in, from, isu, during, agouti, when, ansgar,\n",
"nearest to s: his, callithrix, abet, trinomial, five, microcebus, reginae, and,\n",
"nearest to on: in, at, through, upon, cebus, for, dasyprocta, roshan,\n",
"nearest to use: tamarin, agouti, reginae, argo, albury, most, microsite, callithrix,\n"
]
}
],
"source": [ | |
"# steps to train the model\n", | |
"num_steps = 100001\n", | |
"config = tf.ConfigProto(allow_soft_placement=True)\n", | |
"config.gpu_options.allow_growth = True\n", | |
"\n", | |
"with tf.Session(graph=graph, config=config) as sess:\n", | |
" # we must initialize all variables before using them\n", | |
" # init.run()\n", | |
" sess.run(init)\n", | |
"# tf.global_variables_initializer().run()\n", | |
" print('initialized.')\n", | |
" \n", | |
" # loop through all training steps and keep track of loss\n", | |
" average_loss = 0\n", | |
" for step in range(num_steps):\n", | |
" # generate a minibatch of training data\n", | |
" batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)\n", | |
" feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}\n", | |
" \n", | |
" # we perform a single update step by evaluating the optimizer operation (including it\n", | |
" # in the list of returned values of sess.run())\n", | |
" _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)\n", | |
" average_loss += loss_val\n", | |
" \n", | |
" # print average loss every 2,000 steps\n", | |
" if step % 2000 == 0:\n", | |
" if step > 0:\n", | |
" average_loss /= 2000\n", | |
" # the average loss is an estimate of the loss over the last 2000 batches.\n", | |
" print(\"Average loss at step \", step, \": \", average_loss)\n", | |
" average_loss = 0\n", | |
" \n", | |
" # computing cosine similarity (expensive!)\n", | |
" if step % 10000 == 0:\n", | |
" sim = similarity.eval()\n", | |
" for i in range(valid_size):\n", | |
" # get a single validation sample\n", | |
" valid_word = reverse_dictionary[valid_examples[i]]\n", | |
" # number of nearest neighbors\n", | |
" top_k = 8\n", | |
" # computing nearest neighbors\n", | |
" nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n", | |
" log_str = \"nearest to %s:\" % valid_word\n", | |
" for k in range(top_k):\n", | |
" close_word = reverse_dictionary[nearest[k]]\n", | |
" log_str = \"%s %s,\" % (log_str, close_word)\n", | |
" print(log_str)\n", | |
" \n", | |
" final_embeddings = normalized_embeddings.eval()" | |
] | |
}, | |
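{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `word2vec_basic.py` tutorial referenced at the top ends by projecting the learned embeddings to 2-D with t-SNE. A sketch of that step, assuming `scikit-learn` and `matplotlib` are installed (`n_iter` matches scikit-learn releases of this era; recent versions call it `max_iter`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# project the 500 most frequent words to 2-D with t-SNE and label each point\n",
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"\n",
"tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)\n",
"plot_only = 500\n",
"low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])\n",
"\n",
"plt.figure(figsize=(18, 18))\n",
"for i in range(plot_only):\n",
"    x, y = low_dim_embs[i, :]\n",
"    plt.scatter(x, y)\n",
"    plt.annotate(reverse_dictionary[i], xy=(x, y), xytext=(5, 2),\n",
"                 textcoords='offset points', ha='right', va='bottom')\n",
"plt.show()"
]
},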
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3(tf)",
"language": "python",
"name": "python3_tf"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
} |