Skip to content

Instantly share code, notes, and snippets.

@sushantMoon
Created October 27, 2019 14:07
Show Gist options
  • Save sushantMoon/57f31738b730a4b5cbf28a5d3c9a9b2e to your computer and use it in GitHub Desktop.
Save sushantMoon/57f31738b730a4b5cbf28a5d3c9a9b2e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import tensorflow_hub as hub\n",
"from gensim.scripts.glove2word2vec import glove2word2vec\n",
"from gensim.models import KeyedVectors\n",
"from sklearn.decomposition import PCA\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example sentences we will work with"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"sentence_1 = \"This is Mr. River's Piggy bank.\"\n",
"sentence_2 = \"This Mr. Piggys is river bank.\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['this is mr rivers piggy bank', 'this mr piggys is river bank']\n"
]
}
],
"source": [
"# Cleaning the sentences: removing periods (and the apostrophe in sentence_1), then lowercasing\n",
"sentence_1 = sentence_1.replace('.', '').replace(\"'\", '').lower()\n",
"sentence_2 = sentence_2.replace('.', '').lower()\n",
"sentences = [sentence_1, sentence_2]\n",
"print(sentences)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# GloVe Embedding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Converting the GloVe vectors to the word2vec format so that gensim can load them."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.11 s, sys: 5.91 s, total: 8.02 s\n",
"Wall time: 18.9 s\n"
]
},
{
"data": {
"text/plain": [
"(1917494, 300)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glove_vector_file = 'glove.42B.300d.txt'\n",
"glove_word2vec = 'glove.word2vec'\n",
"glove2word2vec(glove_vector_file, glove_word2vec)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10min 28s, sys: 4.8 s, total: 10min 33s\n",
"Wall time: 10min 33s\n"
]
}
],
"source": [
"%%time\n",
"model = KeyedVectors.load_word2vec_format(glove_word2vec, binary=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Calculating the word vectors for each word\n",
"vector_1 = np.array([model[x] for x in sentence_1.split()])\n",
"vector_2 = np.array([model[x] for x in sentence_2.split()])\n",
"\n",
"# Averaging the word vectors to get the sentence vector\n",
"glove_x = [\n",
" sum(vector_1)/len(sentence_1.split()),\n",
" sum(vector_2)/len(sentence_2.split())\n",
"]\n",
"glove_y = PCA(n_components=2).fit_transform(glove_x)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ELMo Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n",
"CPU times: user 5.18 s, sys: 552 ms, total: 5.74 s\n",
"Wall time: 2.17 s\n"
]
}
],
"source": [
"%%time\n",
"# The model from https://tfhub.dev/google/elmo/3 was downloaded beforehand and is loaded from a local path below\n",
"graph = tf.Graph()\n",
"graph.device(\"/cpu\")\n",
"\n",
"with graph.as_default():\n",
" elmo_model = hub.Module(\"elmov3/\", trainable=False)\n",
" embeddings = elmo_model(\n",
" sentences,\n",
" signature=\"default\",\n",
" as_dict=True\n",
" )[\"default\"]\n",
" with tf.Session() as sess:\n",
" sess.run(tf.global_variables_initializer())\n",
" sess.run(tf.tables_initializer())\n",
" elmo_x = sess.run(embeddings)\n",
"elmo_y = PCA(n_components=2).fit_transform(elmo_x)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plotting the embeddings"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.xlabel(\"X\")\n",
"plt.ylabel(\"Y\")\n",
"plt.title(\"Embeddings\")\n",
"for i in range(len(glove_y)):\n",
" plt.scatter(glove_y[i][0], glove_y[i][1], label=sentences[i]+'(Glove Embedding)', color='red')\n",
"for i in range(len(elmo_y)):\n",
" plt.scatter(elmo_y[i][0], elmo_y[i][1], label=sentences[i]+'(ELMo Embedding)', color='green')\n",
"\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inferences\n",
"\n",
"1. Contextual word embeddings work better in scenarios where a word can have two or more meanings depending on the context in which it is used.\n",
"2. The output from the contextual word embedding model can then be used to build a classification model for better results."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment