Skip to content

Instantly share code, notes, and snippets.

@vmarkovtsev
Created October 6, 2017 14:11
Show Gist options
  • Save vmarkovtsev/cc50b5c2de17e574f59dfe706a39a290 to your computer and use it in GitHub Desktop.
Save vmarkovtsev/cc50b5c2de17e574f59dfe706a39a290 to your computer and use it in GitHub Desktop.
Source code identifier embeddings - legacy demonstration
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2017-10-06 16:00:22-- https://storage.googleapis.com/models.cdn.sourced.tech/legacy/id2vec_500k.pickle\n",
"Resolving storage.googleapis.com... 216.58.209.240, 2a00:1450:4007:80f::2010\n",
"Connecting to storage.googleapis.com|216.58.209.240|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 429378930 (409M) [application/octet-stream]\n",
"Saving to: ‘id2vec_500k.pickle’\n",
"\n",
"id2vec_500k.pickle 100%[===================>] 409.49M 12.2MB/s in 39s \n",
"\n",
"2017-10-06 16:01:02 (10.5 MB/s) - ‘id2vec_500k.pickle’ saved [429378930/429378930]\n",
"\n"
]
}
],
"source": [
"# -c resumes an interrupted download and skips the transfer when the file is already complete,\n",
"# so re-running the notebook neither re-fetches ~409 MB nor leaves id2vec_500k.pickle.1 copies.\n",
"!wget -c https://storage.googleapis.com/models.cdn.sourced.tech/legacy/id2vec_500k.pickle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# NOTE: pickle.load can execute arbitrary code; only load this file from a source you trust.\n",
"# The pickled object unpacks into three items; the middle one is not used in this notebook.\n",
"with open(\"id2vec_500k.pickle\", \"rb\") as model_file:\n",
"    words, _, embeddings = pickle.load(model_file)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"embeddings = array(embeddings)\n",
"# Scale every row to unit L2 norm so that a plain dot product equals cosine similarity.\n",
"squared_norms = (embeddings ** 2).sum(axis=1)\n",
"inverse_norms = 1 / sqrt(squared_norms)\n",
"embeddings = embeddings * inverse_norms[:, newaxis]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def association(w1, w2, w3, n=10):\n",
"    \"\"\"Print the n identifiers whose embeddings are closest to w1 - w2 + w3.\n",
"\n",
"    The analogy vector is scaled to unit length and compared by dot product\n",
"    with every row of the global `embeddings` (rows are scaled to unit L2 norm\n",
"    in the normalization cell, so the score is cosine similarity). Results are\n",
"    printed best match first. The three query identifiers are never printed.\n",
"\n",
"    Nothing is printed when n <= 0. An identifier missing from the global\n",
"    `words` makes `words.index` raise (ValueError if `words` is a list).\n",
"    \"\"\"\n",
"    if n <= 0:\n",
"        # Guard: a slice like [-0:] would select the whole vocabulary.\n",
"        return\n",
"    query = (w1, w2, w3)\n",
"    i1, i2, i3 = (words.index(w) for w in query)\n",
"    vec = embeddings[i1] - embeddings[i2] + embeddings[i3]\n",
"    vec /= sqrt((vec ** 2).sum())\n",
"    # Over-fetch by the number of query words: they usually rank near the top and are\n",
"    # filtered out below, which would otherwise leave fewer than n results.\n",
"    candidates = embeddings.dot(vec).argsort()[::-1][:n + len(query)]\n",
"    hits = [words[i] for i in candidates if words[i] not in query]\n",
"    for nw in hits[:n]:\n",
"        print(nw)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"qux\n",
"quux\n",
"ffoo\n",
"wibble\n",
"afoo\n",
"pvbm\n",
"kioku\n",
"dfoo\n"
]
}
],
"source": [
"association(\"foo\", \"bar\", \"baz\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def nearest(w, n=10):\n",
"    \"\"\"Print the n identifiers most similar to w, best match first.\n",
"\n",
"    Similarity is the dot product between the row of the global `embeddings`\n",
"    for w and every other row (rows are scaled to unit L2 norm in the\n",
"    normalization cell, so the score is cosine similarity). w itself is never\n",
"    printed.\n",
"\n",
"    Nothing is printed when n <= 0. An identifier missing from the global\n",
"    `words` makes `words.index` raise (ValueError if `words` is a list).\n",
"    \"\"\"\n",
"    if n <= 0:\n",
"        # Guard: a slice like [-0:] would select the whole vocabulary.\n",
"        return\n",
"    similarity = embeddings.dot(embeddings[words.index(w)])\n",
"    # Over-fetch by one: w normally ranks first and is dropped below, which would\n",
"    # otherwise leave only n - 1 results.\n",
"    candidates = similarity.argsort()[::-1][:n + 1]\n",
"    hits = [words[i] for i in candidates if words[i] != w]\n",
"    for nw in hits[:n]:\n",
"        print(nw)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dutch\n",
"italian\n",
"danish\n",
"german\n",
"finnish\n",
"russian\n",
"swedish\n",
"french\n",
"nglish\n"
]
}
],
"source": [
"nearest(\"english\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment