{
"metadata": {
"name": "EmbeddingsTutorial"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading Embeddings\n",
"\n",
"The models are pickled as a python tuple.\n",
"\n",
"The first element of the pair is a python list. The list represents the vocabulary words sorted by their frequency in the corpus.\n",
"\n",
"The second element is a python numpy array where each row represents a word vector. The embeddings are stored as 32 bit float numpy array to save space."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pickle\n",
"import numpy\n",
"words, embeddings = pickle.load(open('/home/polyglot/en/words_embeddings_32.pkl', 'rb'))\n",
"print(\"Emebddings shape is {}\".format(embeddings.shape))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Emebddings shape is (100004, 64)\n"
]
}
],
"prompt_number": 25
},
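{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of that structure (a minimal sketch, assuming only the variables loaded above): the vocabulary list and the embedding matrix are parallel, so row i of the matrix is the vector of the i-th word."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sanity check: row i of the embeddings matrix is the vector for words[i].\n",
"assert len(words) == embeddings.shape[0]\n",
"assert embeddings.dtype == numpy.float32  # stored as 32-bit floats to save space\n",
"print(\"{} words, {} dimensions per vector\".format(len(words), embeddings.shape[1]))"
],
"language": "python",
"metadata": {},
"outputs": []
},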
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The size of the emebddings tells about the number of words that make up the vocabulary (100004) and the size of the vector that represent each word (64).\n",
"\n",
"The vocabulary consist of 100004 words. The first four words are special symbols:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Token_ID = {\"<UNK>\": 0, \"<S>\": 1, \"</S>\":2, \"<PAD>\": 3}\n",
"#{<UNK>: Out of vocabulary word,\n",
"# <S>: Start of sentence,\n",
"# </S>: End of sentence,\n",
"# <PAD>: Padding character}."
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 45
},
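{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch of how the unknown-word symbol is used (assuming the words list and Token_ID defined above), tokens that are missing from the vocabulary can be mapped to index 0 when converting text to word indices:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sketch: map tokens to vocabulary indices; tokens missing from the vocabulary\n",
"# fall back to the <UNK> index (0). word_id is built the same way as in the\n",
"# k-NN example further down.\n",
"word_id = {w: i for (i, w) in enumerate(words)}\n",
"tokens = [\"the\", \"of\", \"qwxzvk\"]  # \"qwxzvk\" is presumably out of vocabulary\n",
"print([word_id.get(t, Token_ID[\"<UNK>\"]) for t in tokens])"
],
"language": "python",
"metadata": {},
"outputs": []
},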
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If your sentence is \"I visited New York .\", then the model was trained on the following five 5-grams:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"((\"<PAD>\", \"<S>\", \"I\", \"visited\", \"New\"),\n",
" (\"<S>\", \"I\", \"visited\", \"New\", \"York\"),\n",
" (\"I\", \"visited\", \"New\", \"York\", \".\"),\n",
" (\"visited\", \"New\", \"York\", \".\" \"</S>\"),\n",
" (\"New\", \"York\", \".\", \"</S>\", \"<PAD>\"))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 48
},
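{
"cell_type": "markdown",
"metadata": {},
"source": [
"These windows can also be generated programmatically. The sketch below defines a small helper for illustration; it assumes whitespace-tokenized input and uses the same padding scheme as the example above:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sketch: build the padded 5-gram windows for a tokenized sentence,\n",
"# following the padding scheme shown above.\n",
"def five_grams(tokens):\n",
"    padded = [\"<PAD>\", \"<S>\"] + tokens + [\"</S>\", \"<PAD>\"]\n",
"    return [tuple(padded[i:i + 5]) for i in range(len(padded) - 4)]\n",
"\n",
"for gram in five_grams(\"I visited New York .\".split()):\n",
"    print(gram)"
],
"language": "python",
"metadata": {},
"outputs": []
},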
{
"cell_type": "code",
"collapsed": false,
"input": [
"# First ten words in the vocabulary. Notice the special symbols <UNK> ... <PAD>\n",
"words[:10]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 19,
"text": [
"(u'<UNK>', u'<S>', u'</S>', u'<PAD>', u',', u'the', u'.', u'of', u'and', u'in')"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# A tuple of the word \"outside\" and its representation as a python array\n",
"words[777], embeddings[777]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 26,
"text": [
"(u'outside',\n",
" array([-0.30594289, -0.10619531, 0.22014943, -0.0418048 , -0.25563559,\n",
" -0.22289647, 0.24415126, -0.42394561, -0.29082915, 0.2757107 ,\n",
" -0.01486778, -0.82750046, -0.46192446, 0.02731112, 0.36313367,\n",
" 0.02308739, 0.44220203, 0.63888663, -0.75270784, 0.34825927,\n",
" 0.33574232, 0.13333255, -1.27148712, -0.17001058, -0.94983661,\n",
" -0.02366572, 0.58226883, -0.73669076, 0.20364907, 0.53477538,\n",
" -0.11396599, -0.22912201, -0.18428923, 0.57168871, 0.70096195,\n",
" -0.01094483, -0.10256457, 0.23729944, 0.16012612, -0.08789989,\n",
" 0.32947737, -0.19176106, -0.40289786, 0.2634418 , 0.22998494,\n",
" 0.14719962, -0.03886349, -0.1285357 , -0.05806407, 0.19683087,\n",
" 0.59862757, -0.15636708, -0.53672892, 0.23510239, -0.34235647,\n",
" -0.4950844 , -0.29466859, 1.062222 , -0.15154035, 0.22687389,\n",
" 0.34555328, -0.44103339, 0.43293494, -0.10873429], dtype=float32))"
]
}
],
"prompt_number": 26
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"K-Nearest Neighbors Example"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\"\"\"KNN Example.\"\"\"\n",
"\n",
"from operator import itemgetter\n",
"from itertools import izip, islice\n",
"import re\n",
"import numpy\n",
"\n",
"# Special tokens\n",
"Token_ID = {\"<UNK>\": 0, \"<S>\": 1, \"</S>\":2, \"<PAD>\": 3}\n",
"ID_Token = {v:k for k,v in Token_ID.iteritems()}\n",
"\n",
"# Map words to indices and vice versa\n",
"word_id = {w:i for (i, w) in enumerate(words)}\n",
"id_word = dict(enumerate(words))\n",
"\n",
"# Noramlize digits by replacing them with #\n",
"DIGITS = re.compile(\"[0-9]\", re.UNICODE)\n",
"\n",
"# Number of neighbors to return.\n",
"k = 5\n",
"\n",
"\n",
"def case_normalizer(word, dictionary):\n",
" \"\"\" In case the word is not available in the vocabulary,\n",
" we can try multiple case normalizing procedure.\n",
" We consider the best substitute to be the one with the lowest index,\n",
" which is equivalent to the most frequent alternative.\"\"\"\n",
" w = word\n",
" lower = (dictionary.get(w.lower(), 1e12), w.lower())\n",
" upper = (dictionary.get(w.upper(), 1e12), w.upper())\n",
" title = (dictionary.get(w.title(), 1e12), w.title())\n",
" results = [lower, upper, title]\n",
" results.sort()\n",
" index, w = results[0]\n",
" if index != 1e12:\n",
" return w\n",
" return word\n",
"\n",
"\n",
"def normalize(word, word_id):\n",
" \"\"\" Find the closest alternative in case the word is OOV.\"\"\"\n",
" if not word in word_id:\n",
" word = DIGITS.sub(\"#\", word)\n",
" if not word in word_id:\n",
" word = case_normalizer(word, word_id)\n",
"\n",
" if not word in word_id:\n",
" return None\n",
" return word\n",
"\n",
"\n",
"def l2_nearest(embeddings, word_index, k):\n",
" \"\"\"Sorts words according to their Euclidean distance.\n",
" To use cosine distance, embeddings has to be normalized so that their l2 norm is 1.\"\"\"\n",
"\n",
" e = embeddings[word_index]\n",
" distances = (((embeddings - e) ** 2).sum(axis=1) ** 0.5)\n",
" sorted_distances = sorted(enumerate(distances), key=itemgetter(1))\n",
" return zip(*sorted_distances[:k])\n",
"\n",
"\n",
"def knn(word, embeddings, word_id, id_word):\n",
" word = normalize(word, word_id)\n",
" if not word:\n",
" print(\"OOV word\")\n",
" return\n",
" word_index = word_id[word]\n",
" indices, distances = l2_nearest(embeddings, word_index, k)\n",
" neighbors = [id_word[idx] for idx in indices]\n",
" for i, (word, distance) in enumerate(izip(neighbors, distances)):\n",
" print i, '\\t', word, '\\t\\t', distance\n",
"\n",
"knn(\"Jordan\", embeddings, word_id, id_word)\n",
"print\n",
"knn(\"1986\", embeddings, word_id, id_word)\n",
"print\n",
"knn(\"JAPAN\", embeddings, word_id, id_word)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 \tJordan \t\t0.0\n",
"1 \tHolland \t\t1.47199\n",
"2 \tLucas \t\t1.55305\n",
"3 \tMarshall \t\t1.58405\n",
"4 \tNelson \t\t1.58547\n",
"\n",
"0"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" \t#### \t\t0.0\n",
"1 \t####EN#### \t\t2.66805\n",
"2 \t## \t\t2.8479\n",
"3 \t####EN## \t\t2.88584\n",
"4 \t# \t\t3.05274\n",
"\n",
"0"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" \tJapan \t\t0.0\n",
"1 \tChina \t\t1.38575\n",
"2 \tMexico \t\t1.45689\n",
"3 \tEurope \t\t1.50911\n",
"4 \tBrazil \t\t1.52698\n"
]
}
],
"prompt_number": 49
}
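,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As noted in the docstring of l2_nearest, cosine distance can be used instead of Euclidean distance by first normalizing every embedding to unit l2 norm. A minimal sketch of that normalization, reusing the same knn function (normalized_embeddings is just a name chosen here):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sketch: l2-normalize every embedding so that Euclidean neighbors\n",
"# coincide with cosine-similarity neighbors.\n",
"norms = numpy.sqrt((embeddings ** 2).sum(axis=1))\n",
"normalized_embeddings = embeddings / norms[:, numpy.newaxis]\n",
"knn(\"Jordan\", normalized_embeddings, word_id, id_word)"
],
"language": "python",
"metadata": {},
"outputs": []
}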
],
"metadata": {}
}
]
}