vmarkovtsev/id2vec_legacy.ipynb

## id2vec_legacy.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2017-10-06 16:00:22--  https://storage.googleapis.com/models.cdn.sourced.tech/legacy/id2vec_500k.pickle\n",
      "Resolving storage.googleapis.com... 216.58.209.240, 2a00:1450:4007:80f::2010\n",
      "Connecting to storage.googleapis.com|216.58.209.240|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 429378930 (409M) [application/octet-stream]\n",
      "Saving to: ‘id2vec_500k.pickle’\n",
      "\n",
      "id2vec_500k.pickle  100%[===================>] 409.49M  12.2MB/s    in 39s     \n",
      "\n",
      "2017-10-06 16:01:02 (10.5 MB/s) - ‘id2vec_500k.pickle’ saved [429378930/429378930]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# -nc (no-clobber): skip the ~409 MB download when the file is already present,\n",
    "# instead of saving a duplicate as id2vec_500k.pickle.1 on every re-run\n",
    "!wget -nc https://storage.googleapis.com/models.cdn.sourced.tech/legacy/id2vec_500k.pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "%pylab inline\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# SECURITY: unpickling can execute arbitrary code embedded in the file;\n",
    "# only load this archive if the download source is trusted.\n",
    "with open(\"id2vec_500k.pickle\", \"rb\") as fin:\n",
    "    # the archive unpacks into exactly three items; the middle one is not used here\n",
    "    words, _, embeddings = pickle.load(fin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embeddings = array(embeddings)\n",
    "# rescale each row to unit L2 length by multiplying with the reciprocal of its norm\n",
    "row_norms = sqrt((embeddings ** 2).sum(axis=1))\n",
    "embeddings = embeddings * (1 / row_norms)[:, newaxis]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def association(w1, w2, w3, n=10):\n",
    "    \"\"\"Print the words closest to the analogy vector w1 - w2 + w3.\n",
    "\n",
    "    Every row of `embeddings` is scored by its dot product with the\n",
    "    L2-normalized analogy vector, and the `n` best rows are walked best first.\n",
    "    The three query words are skipped, so fewer than `n` words may be printed.\n",
    "    Raises ValueError (from `words.index`) if a query word is not in `words`.\n",
    "    \"\"\"\n",
    "    query = (w1, w2, w3)\n",
    "    first, second, third = (embeddings[words.index(w)] for w in query)\n",
    "    vec = first - second + third\n",
    "    vec /= sqrt((vec ** 2).sum())\n",
    "    top_rows = embeddings.dot(vec).argsort()[-n:]\n",
    "    for i in top_rows[::-1]:\n",
    "        candidate = words[i]\n",
    "        if candidate in query:\n",
    "            continue\n",
    "        print(candidate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "qux\n",
      "quux\n",
      "ffoo\n",
      "wibble\n",
      "afoo\n",
      "pvbm\n",
      "kioku\n",
      "dfoo\n"
     ]
    }
   ],
   "source": [
    "association(\"foo\", \"bar\", \"baz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def nearest(w, n=10):\n",
    "    \"\"\"Print the words whose embeddings score highest against that of `w`.\n",
    "\n",
    "    Rows of `embeddings` are ranked by dot product with the row for `w`;\n",
    "    the `n` best are walked best first. `w` itself is skipped, so fewer\n",
    "    than `n` words may be printed.\n",
    "    Raises ValueError (from `words.index`) if `w` is not in `words`.\n",
    "    \"\"\"\n",
    "    scores = embeddings.dot(embeddings[words.index(w)])\n",
    "    best_first = scores.argsort()[-n:][::-1]\n",
    "    for i in best_first:\n",
    "        neighbour = words[i]\n",
    "        if neighbour != w:\n",
    "            print(neighbour)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dutch\n",
      "italian\n",
      "danish\n",
      "german\n",
      "finnish\n",
      "russian\n",
      "swedish\n",
      "french\n",
      "nglish\n"
     ]
    }
   ],
   "source": [
    "nearest(\"english\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"--2017-10-06 16:00:22-- https://storage.googleapis.com/models.cdn.sourced.tech/legacy/id2vec_500k.pickle\n",
	"Resolving storage.googleapis.com... 216.58.209.240, 2a00:1450:4007:80f::2010\n",
	"Connecting to storage.googleapis.com|216.58.209.240|:443... connected.\n",
	"HTTP request sent, awaiting response... 200 OK\n",
	"Length: 429378930 (409M) [application/octet-stream]\n",
	"Saving to: ‘id2vec_500k.pickle’\n",
	"\n",
	"id2vec_500k.pickle 100%[===================>] 409.49M 12.2MB/s in 39s \n",
	"\n",
	"2017-10-06 16:01:02 (10.5 MB/s) - ‘id2vec_500k.pickle’ saved [429378930/429378930]\n",
	"\n"
	]
	}
	],
	"source": [
	"# -nc (no-clobber): skip the ~409 MB download when the file is already present,\n",
	"# instead of saving a duplicate as id2vec_500k.pickle.1 on every re-run\n",
	"!wget -nc https://storage.googleapis.com/models.cdn.sourced.tech/legacy/id2vec_500k.pickle"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Populating the interactive namespace from numpy and matplotlib\n"
	]
	}
	],
	"source": [
	"%pylab inline\n",
	"import pickle"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# SECURITY: unpickling can execute arbitrary code embedded in the file;\n",
	"# only load this archive if the download source is trusted.\n",
	"with open(\"id2vec_500k.pickle\", \"rb\") as fin:\n",
	"    # the archive unpacks into exactly three items; the middle one is not used here\n",
	"    words, _, embeddings = pickle.load(fin)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"embeddings = array(embeddings)\n",
	"# rescale each row to unit L2 length by multiplying with the reciprocal of its norm\n",
	"row_norms = sqrt((embeddings ** 2).sum(axis=1))\n",
	"embeddings = embeddings * (1 / row_norms)[:, newaxis]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"def association(w1, w2, w3, n=10):\n",
	"    \"\"\"Print the words closest to the analogy vector w1 - w2 + w3.\n",
	"\n",
	"    Every row of `embeddings` is scored by its dot product with the\n",
	"    L2-normalized analogy vector, and the `n` best rows are walked best first.\n",
	"    The three query words are skipped, so fewer than `n` words may be printed.\n",
	"    Raises ValueError (from `words.index`) if a query word is not in `words`.\n",
	"    \"\"\"\n",
	"    vec = embeddings[words.index(w1)] - embeddings[words.index(w2)] + embeddings[words.index(w3)]\n",
	"    vec /= sqrt((vec ** 2).sum())\n",
	"    # argsort is ascending, so take the last n indices and reverse them\n",
	"    for i in reversed((embeddings.dot(vec)).argsort()[-n:]):\n",
	"        nw = words[i]\n",
	"        if nw not in (w1, w2, w3):\n",
	"            print(nw)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"qux\n",
	"quux\n",
	"ffoo\n",
	"wibble\n",
	"afoo\n",
	"pvbm\n",
	"kioku\n",
	"dfoo\n"
	]
	}
	],
	"source": [
	"association(\"foo\", \"bar\", \"baz\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def nearest(w, n=10):\n",
	"    \"\"\"Print the words whose embeddings score highest against that of `w`.\n",
	"\n",
	"    Rows of `embeddings` are ranked by dot product with the row for `w`;\n",
	"    the `n` best are walked best first. `w` itself is skipped, so fewer\n",
	"    than `n` words may be printed.\n",
	"    Raises ValueError (from `words.index`) if `w` is not in `words`.\n",
	"    \"\"\"\n",
	"    # argsort is ascending, so take the last n indices and reverse them\n",
	"    for i in reversed((embeddings.dot(embeddings[words.index(w)])).argsort()[-n:]):\n",
	"        nw = words[i]\n",
	"        if nw != w:\n",
	"            print(nw)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"dutch\n",
	"italian\n",
	"danish\n",
	"german\n",
	"finnish\n",
	"russian\n",
	"swedish\n",
	"french\n",
	"nglish\n"
	]
	}
	],
	"source": [
	"nearest(\"english\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}