WorkshopNLP
Gist by @ThomasDelteil, last active May 14, 2019
bash
cd SageMaker
sudo yum install htop -y
source activate mxnet_p36

echo "Preparing Face Recognition Lab"
pip install gluoncv
# install gluon-face from source (outside the SageMaker directory)
cd ..
git clone https://github.com/THUFutureLab/gluon-face
cd gluon-face/
python3 setup.py install
cd ..
cd SageMaker
git clone https://github.com/ThomasDelteil/mxnet_mtcnn_face_detection FaceRecognition

echo "Preparing Face Detection Lab"
mkdir FaceDetection
cd FaceDetection
wget https://gist.githubusercontent.com/ThomasDelteil/f52349f26701d519b93649e21e62ff77/raw/4d4c81f73382de73d02199b9fe74939b7b49c4e9/face_detection.ipynb
cd ..

echo "Preparing GluonCV lab"
mkdir GluonCV
cd GluonCV
wget https://gist.githubusercontent.com/ThomasDelteil/f52349f26701d519b93649e21e62ff77/raw/48dc108b25ed6d4eaeb0901485c7d2434524f5eb/gluonCV.ipynb
cd ..

echo "Preparing GAN Lab"
pip install pillow
git clone https://gist.github.com/vishaalkapoor/2fcce8981cad4af5cb42eb700974d3cf FaceGenerationGAN
cd FaceGenerationGAN
mkdir dataset output
wget https://s3-us-west-2.amazonaws.com/mxnet-workshop-dropbox/celeba-dataset.zip
cd dataset
unzip ../celeba-dataset.zip
unzip img_align_celeba.zip
# remove the intermediate archives and metadata CSVs once extracted
rm img*.zip list*.csv
cd ..
cd ..
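
The script above is meant to be run from a terminal on the SageMaker notebook instance: it assumes the mxnet_p36 conda environment and the ~/SageMaker directory that a SageMaker notebook instance provides. A minimal sketch of how it might be launched, assuming it has been saved as setup.sh (a hypothetical filename) in the home directory:

cd ~
bash setup.sh 2>&1 | tee setup.log
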
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Word2Vec Training Japanese"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"import itertools\n",
"import functools\n",
"import time\n",
"import math\n",
"import logging\n",
"import random\n",
"import tarfile\n",
"\n",
"import MeCab\n",
"import mxnet as mx\n",
"from mxnet import gluon\n",
"import gluonnlp as nlp\n",
"import numpy as np\n",
"from scipy import stats"
]
},
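{
"cell_type": "markdown",
"metadata": {},
"source": [
"The imports above assume that MeCab and a Japanese dictionary are already available on this instance. A minimal sketch of one way to get the Python bindings is below; the package names `mecab-python3` and `unidic-lite` are an assumption and may differ from the environment this workshop was prepared for."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumed install commands; skip if MeCab is already set up on the instance.\n",
"# !pip install mecab-python3 unidic-lite\n",
"\n",
"# Quick smoke test: '-Owakati' outputs space-separated surface tokens.\n",
"import MeCab\n",
"print(MeCab.Tagger('-Owakati').parse('これはテストです'))"
]
},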
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"mx.test_utils.download('https://wit3.fbk.eu/archive/2017-01-trnted//texts/en/ja/en-ja.tgz')\n",
"tar = tarfile.open(\"en-ja.tgz\", \"r:gz\")\n",
"tar.extractall()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"class JESDataset(gluon.data.Dataset):\n",
" def __init__(self, split='train', filepath='en-ja/train.tags.en-ja'): \n",
" file_ja = filepath+'.ja'\n",
" \n",
" with open(file_ja, 'r') as f:\n",
" japanese_sentences = f.read().splitlines()\n",
" japanese_sentences = [s for s in japanese_sentences if s[0:2] != \" <\" ] # and s[0] != ' <']\n",
" l = len(japanese_sentences)\n",
" self.sentences = japanese_sentences\n",
" self.tagger = MeCab.Tagger('-Owakati')\n",
" \n",
" def __getitem__(self, idx):\n",
" return self.tagger.parse(self.sentences[idx]).replace('\\n','').split(' ')\n",
" \n",
" def __len__(self):\n",
" return len(self.sentences)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"dataset = JESDataset()"
]
},
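{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the `Dataset` defined above (a sketch added for illustration): each sample is one MeCab-tokenized sentence, returned as a list of surface tokens. The exact tokens depend on the installed MeCab dictionary, so no output is assumed here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Number of sentences and the first ten tokens of the first sample.\n",
"print(len(dataset))\n",
"print(dataset[0][:10])"
]
},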
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"counter = nlp.data.count_tokens(itertools.chain.from_iterable(dataset))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,\n",
" bos_token=None, eos_token=None, min_freq=2)\n",
"idx_to_counts = [counter[w] for w in vocab.idx_to_token]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# sentences: 223108\n",
"# tokens: 28 [891, 178, 1045, 26, 1398]\n",
"# tokens: 64 [26, 4429, 3, 2, 4973]\n",
"# tokens: 11 [906, 291, 18, 2, 9]\n"
]
}
],
"source": [
"def code(sentence):\n",
" return [vocab[token] for token in sentence if token in vocab]\n",
"\n",
"dataset = dataset.transform(code, lazy=False)\n",
"\n",
"print('# sentences:', len(dataset))\n",
"for sentence in dataset[:3]:\n",
" print('# tokens:', len(sentence), sentence[:5])\n"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"def transform_data_fasttext(data, vocab, idx_to_counts, cbow, ngram_buckets,\n",
" ngrams, batch_size, window_size,\n",
" frequent_token_subsampling=1E-4, dtype='float32',\n",
" index_dtype='int64'):\n",
" \"\"\"Transform a DataStream of coded DataSets to a DataStream of batches.\n",
"\n",
" Parameters\n",
" ----------\n",
" data : gluonnlp.data.DataStream\n",
" DataStream where each sample is a valid input to\n",
" gluonnlp.data.EmbeddingCenterContextBatchify.\n",
" vocab : gluonnlp.Vocab\n",
" Vocabulary containing all tokens whose indices occur in data. For each\n",
" token, it's associated subwords will be computed and used for\n",
" constructing the batches. No subwords are used if ngram_buckets is 0.\n",
" idx_to_counts : list of int\n",
" List of integers such that idx_to_counts[idx] represents the count of\n",
" vocab.idx_to_token[idx] in the underlying dataset. The count\n",
" information is used to subsample frequent words in the dataset.\n",
" Each token is independently dropped with probability 1 - sqrt(t /\n",
" (count / sum_counts)) where t is the hyperparameter\n",
" frequent_token_subsampling.\n",
" cbow : boolean\n",
" If True, batches for CBOW are returned.\n",
" ngram_buckets : int\n",
" Number of hash buckets to consider for the fastText\n",
" nlp.vocab.NGramHashes subword function.\n",
" ngrams : list of int\n",
" For each integer n in the list, all ngrams of length n will be\n",
" considered by the nlp.vocab.NGramHashes subword function.\n",
" batch_size : int\n",
" The returned data stream iterates over batches of batch_size.\n",
" window_size : int\n",
" The context window size for\n",
" gluonnlp.data.EmbeddingCenterContextBatchify.\n",
" frequent_token_subsampling : float\n",
" Hyperparameter for subsampling. See idx_to_counts above for more\n",
" information.\n",
" dtype : str or np.dtype, default 'float32'\n",
" Data type of data array.\n",
" index_dtype : str or np.dtype, default 'int64'\n",
" Data type of index arrays.\n",
"\n",
" Returns\n",
" -------\n",
" gluonnlp.data.DataStream\n",
" Stream over batches. Each returned element is a list corresponding to\n",
" the arguments for the forward pass of model.SG or model.CBOW\n",
" respectively based on if cbow is False or True. If ngarm_buckets > 0,\n",
" the returned sample will contain ngrams. Both model.SG or model.CBOW\n",
" will handle them correctly as long as they are initialized with the\n",
" subword_function returned as second argument by this function (see\n",
" below).\n",
" gluonnlp.vocab.NGramHashes\n",
" The subword_function used for obtaining the subwords in the returned\n",
" batches.\n",
"\n",
" \"\"\"\n",
" if ngram_buckets <= 0:\n",
" raise ValueError('Invalid ngram_buckets. Use Word2Vec training '\n",
" 'pipeline if not interested in ngrams.')\n",
"\n",
" sum_counts = float(sum(idx_to_counts))\n",
" idx_to_pdiscard = [\n",
" 1 - math.sqrt(frequent_token_subsampling / (count / sum_counts))\n",
" for count in idx_to_counts]\n",
"\n",
" def subsample(shard):\n",
" return [[\n",
" t for t, r in zip(sentence,\n",
" np.random.uniform(0, 1, size=len(sentence)))\n",
" if r > idx_to_pdiscard[t]] for sentence in shard]\n",
"\n",
" data = data.transform(subsample)\n",
"\n",
" batchify = nlp.data.batchify.EmbeddingCenterContextBatchify(\n",
" batch_size=batch_size, window_size=window_size, cbow=cbow,\n",
" weight_dtype=dtype, index_dtype=index_dtype)\n",
" data = data.transform(batchify)\n",
"\n",
" subword_function = nlp.vocab.create_subword_function(\n",
" 'NGramHashes', ngrams=ngrams, num_subwords=ngram_buckets)\n",
"\n",
" # Store subword indices for all words in vocabulary\n",
" idx_to_subwordidxs = list(subword_function(vocab.idx_to_token))\n",
" subwordidxs = np.concatenate(idx_to_subwordidxs)\n",
" subwordidxsptr = np.cumsum([\n",
" len(subwordidxs) for subwordidxs in idx_to_subwordidxs])\n",
" subwordidxsptr = np.concatenate([\n",
" np.zeros(1, dtype=np.int64), subwordidxsptr])\n",
" if cbow:\n",
" subword_lookup = functools.partial(\n",
" cbow_lookup, subwordidxs=subwordidxs,\n",
" subwordidxsptr=subwordidxsptr, offset=len(vocab))\n",
" else:\n",
" subword_lookup = functools.partial(\n",
" skipgram_lookup, subwordidxs=subwordidxs,\n",
" subwordidxsptr=subwordidxsptr, offset=len(vocab))\n",
" max_subwordidxs_len = max(len(s) for s in idx_to_subwordidxs)\n",
" if max_subwordidxs_len > 500:\n",
" warnings.warn(\n",
" 'The word with largest number of subwords '\n",
" 'has {} subwords, suggesting there are '\n",
" 'some noisy words in your vocabulary. '\n",
" 'You should filter out very long words '\n",
" 'to avoid memory issues.'.format(max_subwordidxs_len))\n",
"\n",
" data = UnchainStream(data)\n",
"\n",
" if cbow:\n",
" batchify_fn = cbow_fasttext_batch\n",
" else:\n",
" batchify_fn = skipgram_fasttext_batch\n",
" batchify_fn = functools.partial(\n",
" batchify_fn, num_tokens=len(vocab) + len(subword_function),\n",
" subword_lookup=subword_lookup, dtype=dtype, index_dtype=index_dtype)\n",
"\n",
" return data, batchify_fn, subword_function\n",
"\n",
"def skipgram_lookup(indices, subwordidxs, subwordidxsptr, offset=0):\n",
" \"\"\"Get a sparse COO array of words and subwords for SkipGram.\n",
"\n",
" Parameters\n",
" ----------\n",
" indices : numpy.ndarray\n",
" Array containing numbers in [0, vocabulary_size). The element at\n",
" position idx is taken to be the word that occurs at row idx in the\n",
" SkipGram batch.\n",
" offset : int\n",
" Offset to add to each subword index.\n",
" subwordidxs : numpy.ndarray\n",
" Array containing concatenation of all subwords of all tokens in the\n",
" vocabulary, in order of their occurrence in the vocabulary.\n",
" For example np.concatenate(idx_to_subwordidxs)\n",
" subwordidxsptr\n",
" Array containing pointers into subwordidxs array such that\n",
" subwordidxs[subwordidxsptr[i]:subwordidxsptr[i+1]] returns all subwords\n",
" of of token i. For example subwordidxsptr = np.cumsum([\n",
" len(subwordidxs) for subwordidxs in idx_to_subwordidxs])\n",
" offset : int, default 0\n",
" Offset to add to each subword index.\n",
"\n",
" Returns\n",
" -------\n",
" numpy.ndarray of dtype float32\n",
" Array containing weights such that for each row, all weights sum to\n",
" 1. In particular, all elements in a row have weight 1 /\n",
" num_elements_in_the_row\n",
" numpy.ndarray of dtype int64\n",
" This array is the row array of a sparse array of COO format.\n",
" numpy.ndarray of dtype int64\n",
" This array is the col array of a sparse array of COO format.\n",
"\n",
" \"\"\"\n",
" row = []\n",
" col = []\n",
" data = []\n",
" for i, idx in enumerate(indices):\n",
" idx = int(idx)\n",
" start = subwordidxsptr[idx]\n",
" end = subwordidxsptr[idx + 1]\n",
"\n",
" row.append(i)\n",
" col.append(idx)\n",
" data.append(1 / (1 + end - start))\n",
" for subword in subwordidxs[start:end]:\n",
" row.append(i)\n",
" col.append(subword + offset)\n",
" data.append(1 / (1 + end - start))\n",
"\n",
" return (np.array(data, dtype=np.float32), np.array(row, dtype=np.int64),\n",
" np.array(col, dtype=np.int64))\n",
"\n",
"\n",
"def skipgram_batch(centers, contexts, num_tokens, dtype, index_dtype):\n",
" \"\"\"Create a batch for SG training objective.\"\"\"\n",
" contexts = mx.nd.array(contexts[2], dtype=index_dtype)\n",
" indptr = mx.nd.arange(len(centers) + 1)\n",
" centers = mx.nd.array(centers, dtype=index_dtype)\n",
" centers_csr = mx.nd.sparse.csr_matrix(\n",
" (mx.nd.ones(centers.shape), centers, indptr), dtype=dtype,\n",
" shape=(len(centers), num_tokens))\n",
" return centers_csr, contexts, centers\n",
"\n",
"\n",
"class UnchainStream(nlp.data.DataStream):\n",
" def __init__(self, iterable):\n",
" self._stream = iterable\n",
"\n",
" def __iter__(self):\n",
" return iter(itertools.chain.from_iterable(self._stream))\n",
" \n",
"def skipgram_fasttext_batch(centers, contexts, num_tokens, subword_lookup,\n",
" dtype, index_dtype):\n",
" \"\"\"Create a batch for SG training objective with subwords.\"\"\"\n",
" contexts = mx.nd.array(contexts[2], dtype=index_dtype)\n",
" data, row, col = subword_lookup(centers)\n",
" centers = mx.nd.array(centers, dtype=index_dtype)\n",
" centers_csr = mx.nd.sparse.csr_matrix(\n",
" (data, (row, col)), dtype=dtype,\n",
" shape=(len(centers), num_tokens)) # yapf: disable\n",
" return centers_csr, contexts, centers"
]
},
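{
"cell_type": "markdown",
"metadata": {},
"source": [
"The subsampling rule described in the docstring above drops a token with probability $1 - \\sqrt{t / f}$, where $f = \\text{count} / \\text{sum\\_counts}$ is the token's relative frequency and $t$ is `frequent_token_subsampling`. The cell below is a small illustration (not part of the original pipeline, with made-up counts) showing that only tokens whose relative frequency exceeds $t$ are ever dropped, and that very frequent tokens are dropped aggressively."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Worked example of the frequent-token subsampling formula.\n",
"# The counts below are made up for illustration only.\n",
"t = 1E-4\n",
"total = 1000000\n",
"for count in [10, 100, 1000, 50000]:\n",
"    f = count / total\n",
"    p_discard = max(0, 1 - math.sqrt(t / f))\n",
"    print('count=%6d  rel. freq=%.5f  discard prob=%.3f' % (count, f, p_discard))"
]
},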
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"batch_size=4096\n",
"data = nlp.data.SimpleDataStream([dataset]) # input is a stream of datasets, here just 1. Allows scaling to larger corpora that don't fit in memory\n",
"data, batchify_fn, subword_function = transform_data_fasttext(\n",
" data, vocab, idx_to_counts, cbow=False, ngrams=[1,2,3,4], ngram_buckets=100000, batch_size=batch_size, window_size=5)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"batches = data.transform(batchify_fn)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<の>\t[94239, 20211, 73477, 67233, 87777]\n",
"<>\t[46367, 87777]\n",
"<は>\t[16620, 17878, 95858, 96964, 87777]\n"
]
}
],
"source": [
"idx_to_subwordidxs = subword_function(vocab.idx_to_token)\n",
"for word, subwords in zip(vocab.idx_to_token[:3], idx_to_subwordidxs[:3]):\n",
" print('<'+word+'>', subwords, sep = '\\t')\n"
]
},
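{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because fastText represents a word through its character n-grams hashed into a fixed number of buckets, `subword_function` can also produce indices for words that never made it into the vocabulary. The cell below sketches this; the word chosen here is arbitrary and its bucket indices depend on the hash buckets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Subword (n-gram hash) indices for a possibly out-of-vocabulary word.\n",
"oov_word = 'トレーニング'  # arbitrary example word\n",
"print(oov_word in vocab, list(subword_function([oov_word]))[0])"
]
},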
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Network"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"context = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"import mxnet as mx\n",
"import numpy as np\n",
"\n",
"import gluonnlp as nlp\n",
"\n",
"\n",
"class Net(mx.gluon.HybridBlock):\n",
" \"\"\"Base class for word2vec and fastText SkipGram and CBOW networks.\n",
"\n",
" Parameters\n",
" ----------\n",
" token_to_idx : dict\n",
" token_to_idx mapping of the vocabulary that this model is to be trained\n",
" with. token_to_idx is used for __getitem__ and __contains__. For\n",
" len(token_to_idx) is used during initialization to obtain the input_dim\n",
" of the embedding matrix.\n",
" output_dim : int\n",
" Dimension of the dense embedding.\n",
" batch_size : int\n",
" Batchsize this model will be trained with. TODO temporary until\n",
" random_like ops are supported\n",
" negatives_weights : mxnet.nd.NDArray\n",
" Weights for UnigramCandidateSampler for sampling negatives.\n",
" smoothing : float, default 0.75\n",
" Smoothing factor applied to negatives_weights. Final weights are\n",
" mxnet.nd.power(negative_weights, smoothing).\n",
" num_negatives : int, default 5\n",
" Number of negatives to sample for each real sample.\n",
" sparse_grad : bool, default True\n",
" Specifies mxnet.gluon.nn.Embedding sparse_grad argument.\n",
" dtype : str, default 'float32'\n",
" dtype argument passed to gluon.nn.Embedding\n",
"\n",
" \"\"\"\n",
"\n",
" # pylint: disable=abstract-method\n",
" def __init__(self, token_to_idx, output_dim, batch_size, negatives_weights,\n",
" subword_function=None, num_negatives=5, smoothing=0.75,\n",
" sparse_grad=True, dtype='float32', **kwargs):\n",
" super(Net, self).__init__(**kwargs)\n",
"\n",
" self._kwargs = dict(\n",
" input_dim=len(token_to_idx), output_dim=output_dim, dtype=dtype,\n",
" sparse_grad=sparse_grad, num_negatives=num_negatives)\n",
"\n",
" with self.name_scope():\n",
" if subword_function is not None:\n",
" self.embedding = nlp.model.train.FasttextEmbeddingModel(\n",
" token_to_idx=token_to_idx,\n",
" subword_function=subword_function,\n",
" output_dim=output_dim,\n",
" weight_initializer=mx.init.Uniform(scale=1 / output_dim),\n",
" sparse_grad=sparse_grad,\n",
" )\n",
" else:\n",
" self.embedding = nlp.model.train.CSREmbeddingModel(\n",
" token_to_idx=token_to_idx,\n",
" output_dim=output_dim,\n",
" weight_initializer=mx.init.Uniform(scale=1 / output_dim),\n",
" sparse_grad=sparse_grad,\n",
" )\n",
" self.embedding_out = mx.gluon.nn.Embedding(\n",
" len(token_to_idx), output_dim=output_dim,\n",
" weight_initializer=mx.init.Zero(), sparse_grad=sparse_grad,\n",
" dtype=dtype)\n",
"\n",
" self.negatives_sampler = nlp.data.UnigramCandidateSampler(\n",
" weights=negatives_weights**smoothing, shape=(batch_size, ),\n",
" dtype='int64')\n",
"\n",
" def __getitem__(self, tokens):\n",
" return self.embedding[tokens]\n",
"\n",
"\n",
"class SG(Net):\n",
" \"\"\"SkipGram network\"\"\"\n",
"\n",
" # pylint: disable=arguments-differ\n",
" def hybrid_forward(self, F, center, context, center_words):\n",
" \"\"\"SkipGram forward pass.\n",
"\n",
" Parameters\n",
" ----------\n",
" center : mxnet.nd.NDArray or mxnet.sym.Symbol\n",
" Sparse CSR array of word / subword indices of shape (batch_size,\n",
" len(token_to_idx) + num_subwords). Embedding for center words are\n",
" computed via F.sparse.dot between the CSR center array and the\n",
" weight matrix.\n",
" context : mxnet.nd.NDArray or mxnet.sym.Symbol\n",
" Dense array of context words of shape (batch_size, ). Also used for\n",
" row-wise independently masking negatives equal to one of context.\n",
" center_words : mxnet.nd.NDArray or mxnet.sym.Symbol\n",
" Dense array of center words of shape (batch_size, ). Only used for\n",
" row-wise independently masking negatives equal to one of\n",
" center_words.\n",
" \"\"\"\n",
"\n",
" # negatives sampling\n",
" negatives = []\n",
" mask = []\n",
" for _ in range(self._kwargs['num_negatives']):\n",
" negatives.append(self.negatives_sampler(center_words))\n",
" mask_ = negatives[-1] != center_words\n",
" mask_ = F.stack(mask_, (negatives[-1] != context))\n",
" mask.append(mask_.min(axis=0))\n",
"\n",
" negatives = F.stack(*negatives, axis=1)\n",
" mask = F.stack(*mask, axis=1).astype(np.float32)\n",
"\n",
" # center - context pairs\n",
" emb_center = self.embedding(center).expand_dims(1)\n",
" emb_context = self.embedding_out(context).expand_dims(2)\n",
" pred_pos = F.batch_dot(emb_center, emb_context).squeeze()\n",
" loss_pos = (F.relu(pred_pos) - pred_pos + F.Activation(\n",
" -F.abs(pred_pos), act_type='softrelu')) / (mask.sum(axis=1) + 1)\n",
"\n",
" # center - negatives pairs\n",
" emb_negatives = self.embedding_out(negatives).reshape(\n",
" (-1, self._kwargs['num_negatives'],\n",
" self._kwargs['output_dim'])).swapaxes(1, 2)\n",
" pred_neg = F.batch_dot(emb_center, emb_negatives).squeeze()\n",
" mask = mask.reshape((-1, self._kwargs['num_negatives']))\n",
" loss_neg = (F.relu(pred_neg) + F.Activation(\n",
" -F.abs(pred_neg), act_type='softrelu')) * mask\n",
" loss_neg = loss_neg.sum(axis=1) / (mask.sum(axis=1) + 1)\n",
"\n",
" return loss_pos + loss_neg\n",
"\n"
]
},
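{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `loss_pos` and `loss_neg` terms above are the usual negative-sampling objective written in a numerically stable way: `relu(x) - x + softplus(-|x|)` equals $\\log(1 + e^{-x})$, the loss for a positive center/context pair, and `relu(x) + softplus(-|x|)` equals $\\log(1 + e^{x})$, the loss for a sampled negative. The check below only illustrates this identity and is not used by the training loop."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Numerical check of the stable formulation used in SG.hybrid_forward.\n",
"x = mx.nd.array([-10, -1, 0, 1, 10])\n",
"softplus = mx.nd.Activation(-mx.nd.abs(x), act_type='softrelu')\n",
"loss_pos_stable = mx.nd.relu(x) - x + softplus\n",
"loss_neg_stable = mx.nd.relu(x) + softplus\n",
"# Both differences should be (numerically) zero.\n",
"print(loss_pos_stable - mx.nd.log(1 + mx.nd.exp(-x)))\n",
"print(loss_neg_stable - mx.nd.log(1 + mx.nd.exp(x)))"
]
},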
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SG(\n",
" (embedding): FasttextEmbeddingModel(36590 + 100000 -> 300, float32)\n",
" (embedding_out): Embedding(36590 -> 300, float32)\n",
" (negatives_sampler): UnigramCandidateSampler(36590, int64)\n",
")\n"
]
}
],
"source": [
"emsize = 300\n",
"num_negatives = 5\n",
"\n",
"negatives_weights = mx.nd.array(idx_to_counts)\n",
"embedding = SG(\n",
" vocab.token_to_idx, emsize, batch_size, negatives_weights, subword_function, num_negatives=5, smoothing=0.75)\n",
"embedding.initialize(ctx=context)\n",
"embedding.hybridize()\n",
"trainer = mx.gluon.Trainer(embedding.collect_params(), 'adagrad', dict(learning_rate=0.05))\n",
"\n",
"print(embedding)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"def norm_vecs_by_row(x):\n",
" return x / (mx.nd.sum(x * x, axis=1) + 1e-10).sqrt().reshape((-1, 1))\n",
"\n",
"\n",
"def get_k_closest_tokens(vocab, embedding, k, word):\n",
" word_vec = norm_vecs_by_row(embedding[[word]])\n",
" vocab_vecs = norm_vecs_by_row(embedding[vocab.idx_to_token])\n",
" dot_prod = mx.nd.dot(vocab_vecs, word_vec.T)\n",
" indices = mx.nd.topk(\n",
" dot_prod.reshape((len(vocab.idx_to_token), )),\n",
" k=k + 1,\n",
" ret_typ='indices')\n",
" indices = [int(i.asscalar()) for i in indices]\n",
" result = [vocab.idx_to_token[i] for i in indices[1:]]\n",
" print('closest tokens to \"%s\": %s' % (word, \", \".join(result)))"
]
},
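{
"cell_type": "markdown",
"metadata": {},
"source": [
"`get_k_closest_tokens` ranks the vocabulary by cosine similarity: after `norm_vecs_by_row`, every row has (approximately) unit L2 norm, so the dot product between rows reduces to\n",
"\n",
"$$\\cos(u, v) = \\frac{u \\cdot v}{\\lVert u \\rVert \\, \\lVert v \\rVert},$$\n",
"\n",
"and `topk` is called with `k + 1` because the most similar token is typically the query word itself, which is then skipped via `indices[1:]`."
]
},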
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"closest tokens to \"家\": 史家, 家族, 農家, 家賃, 自家, 家屋, 家禽, 人家, 家系, 家政\n"
]
}
],
"source": [
"example_token = \"家\"\n",
"get_k_closest_tokens(vocab, embedding, 10, example_token)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"log_interval = 500\n",
"\n",
"def train_embedding(num_epochs):\n",
" for epoch in range(1, num_epochs + 1):\n",
" start_time = time.time()\n",
" l_avg = 0\n",
" log_wc = 0\n",
"\n",
" print('Beginnign epoch %d and resampling data.' % epoch)\n",
" for i, batch in enumerate(batches):\n",
" batch = [array.as_in_context(context) for array in batch]\n",
" with mx.autograd.record():\n",
" l = embedding(*batch)\n",
" l.backward()\n",
" trainer.step(1)\n",
"\n",
" l_avg += l.mean()\n",
" log_wc += l.shape[0]\n",
" if i % log_interval == 0:\n",
" mx.nd.waitall()\n",
" wps = log_wc / (time.time() - start_time)\n",
" l_avg = l_avg.asscalar() / log_interval\n",
" print('epoch %d, iteration %d, loss %.2f, throughput=%.2fK wps'\n",
" % (epoch, i, l_avg, wps / 1000))\n",
" start_time = time.time()\n",
" log_wc = 0\n",
" l_avg = 0\n",
"\n",
" get_k_closest_tokens(vocab, embedding, 10, example_token)\n",
" print(\"\")"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Beginnign epoch 1 and resampling data.\n",
"epoch 1, iteration 0, loss 0.00, throughput=1.35K wps\n",
"epoch 1, iteration 500, loss 0.39, throughput=65.55K wps\n",
"epoch 1, iteration 1000, loss 0.39, throughput=65.54K wps\n",
"epoch 1, iteration 1500, loss 0.39, throughput=65.28K wps\n",
"epoch 1, iteration 2000, loss 0.39, throughput=65.07K wps\n",
"closest tokens to \"入力\": 入出力, 出力, 記入, 読み取ら, ピリオド, 点字, CAPTCHA, 一文字, 読み取っ, ダウンロード\n",
"\n",
"Beginnign epoch 2 and resampling data.\n",
"epoch 2, iteration 0, loss 0.00, throughput=1.36K wps\n",
"epoch 2, iteration 500, loss 0.39, throughput=65.89K wps\n",
"epoch 2, iteration 1000, loss 0.39, throughput=66.14K wps\n",
"epoch 2, iteration 1500, loss 0.39, throughput=65.93K wps\n",
"epoch 2, iteration 2000, loss 0.39, throughput=66.04K wps\n",
"closest tokens to \"入力\": 入出力, 出力, 記入, 読み取ら, CAPTCHA, ピリオド, CAPCHA, 点字, 推量, インプット\n",
"\n"
]
}
],
"source": [
"train_embedding(num_epochs=2)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"closest tokens to \"入力\": 入出力, 出力, 読み取ら, CAPTCHA, 記入, 読み取っ, 送信, ダウンロード, インプット, 読み取れ\n"
]
}
],
"source": [
"example_token = \"入力\"\n",
"get_k_closest_tokens(vocab, embedding, 10, example_token)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "",
"name": ""
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}