OverPoweredDev/cbow.ipynb

## cbow.ipynb
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/OverPoweredDev/fc42001de01a4d5d087c6ba6cf6c18b8/cbow.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2b4f5fe3",
      "metadata": {
        "id": "2b4f5fe3"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import keras.backend as K\n",
        "from keras.models import Sequential\n",
        "from keras.layers import Dense, Embedding, Lambda\n",
        "from keras.utils import np_utils\n",
        "from keras.preprocessing import sequence\n",
        "from keras.preprocessing.text import Tokenizer\n",
        "import gensim\n",
        "import nltk"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ed0fd15c",
      "metadata": {
        "id": "ed0fd15c"
      },
      "outputs": [],
      "source": [
        "# Read the corpus, keeping only lines that contain at least two spaces.\n",
        "# The with-block closes the handle, so `data` is a closed file afterwards;\n",
        "# later cells should iterate `corona_data`, not `data`.\n",
        "with open('text', 'r') as data:\n",
        "    corona_data = [text for text in data if text.count(' ') >= 2]\n",
        "\n",
        "# Learn the word -> id mapping, then turn every line into a list of ids.\n",
        "vectorize = Tokenizer()\n",
        "vectorize.fit_on_texts(corona_data)\n",
        "corona_data = vectorize.texts_to_sequences(corona_data)\n",
        "\n",
        "# Vocabulary size: word ids start at 1, so add one slot for id 0,\n",
        "# which pad_sequences uses as its default padding value.\n",
        "total_vocab = len(vectorize.word_index) + 1\n",
        "# Total number of tokens in the filtered corpus.\n",
        "word_count = sum(len(s) for s in corona_data)\n",
        "window_size = 2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8f0eb51a",
      "metadata": {
        "id": "8f0eb51a"
      },
      "outputs": [],
      "source": [
        "def get_cosine_sim(A,B):\n",
        "    \"\"\"Return the cosine similarity between vectors A and B.\"\"\"\n",
        "    norm_product = np.linalg.norm(A) * np.linalg.norm(B)\n",
        "    return np.dot(A, B) / norm_product"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "17cb7fb1",
      "metadata": {
        "id": "17cb7fb1"
      },
      "outputs": [],
      "source": [
        "def cbow_model(data, window_size, total_vocab):\n",
        "    \"\"\"Yield one (context, target) CBOW training pair per word in `data`.\n",
        "\n",
        "    Args:\n",
        "        data: iterable of sequences of integer word ids.\n",
        "        window_size: number of neighbours taken on each side of the target.\n",
        "        total_vocab: number of classes used to one-hot encode the target.\n",
        "\n",
        "    Yields:\n",
        "        (contextual, final_target): the context ids padded to\n",
        "        2 * window_size columns, and the one-hot encoded target word.\n",
        "    \"\"\"\n",
        "    total_length = window_size * 2\n",
        "    for text in data:\n",
        "        text_len = len(text)\n",
        "        for idx, word in enumerate(text):\n",
        "            begin = idx - window_size\n",
        "            end = idx + window_size + 1\n",
        "            # Neighbours inside the window, skipping the target itself and\n",
        "            # positions that fall off either end of the sequence.\n",
        "            context_word = [[text[i] for i in range(begin, end)\n",
        "                             if 0 <= i < text_len and i != idx]]\n",
        "            target = [word]\n",
        "            # `maxlen` is the pad_sequences keyword for the fixed output\n",
        "            # width; `total_length` is not an accepted keyword.\n",
        "            contextual = sequence.pad_sequences(context_word, maxlen=total_length)\n",
        "            final_target = np_utils.to_categorical(target, total_vocab)\n",
        "            yield (contextual, final_target)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "91502185",
      "metadata": {
        "id": "91502185"
      },
      "outputs": [],
      "source": [
        "# CBOW network: embed the context ids, average the embeddings over the\n",
        "# context axis, then predict the target word with a softmax layer.\n",
        "model = Sequential()\n",
        "model.add(Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size*2))\n",
        "model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))\n",
        "model.add(Dense(total_vocab, activation='softmax'))\n",
        "model.compile(loss='categorical_crossentropy', optimizer='adam')\n",
        "\n",
        "for i in range(10):\n",
        "    cost = 0\n",
        "    # Iterate the tokenised corpus (`corona_data`); `data` is the raw file\n",
        "    # handle, which has already been read through when the corpus was loaded.\n",
        "    for x, y in cbow_model(corona_data, window_size, total_vocab):\n",
        "        # Train on the pair the generator just yielded.\n",
        "        cost += model.train_on_batch(x, y)\n",
        "    print(i, cost)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ad7fb11d",
      "metadata": {
        "id": "ad7fb11d"
      },
      "outputs": [],
      "source": [
        "# Open the output file and write its header line: the value of\n",
        "# total_vocab followed by the vector dimensionality.\n",
        "# NOTE(review): the handle stays open after this cell.\n",
        "dimensions = 100\n",
        "vect_file = open('vectors.txt', mode='w')\n",
        "header = '{} {}\\n'.format(total_vocab, dimensions)\n",
        "vect_file.write(header)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7dc257bc",
      "metadata": {
        "id": "7dc257bc"
      },
      "outputs": [],
      "source": [
        "# First weight array of the model (the Embedding layer is added first\n",
        "# when the model is built), indexed by word id.\n",
        "weights = model.get_weights()[0]\n",
        "\n",
        "# Map every word known to the tokenizer to its row of `weights`.\n",
        "word2vec = {text: weights[i, :] for text, i in vectorize.word_index.items()}\n",
        "\n",
        "# Disabled: dump the vectors to `vect_file`, one word per line.\n",
        "# for text, i in vectorize.word_index.items():\n",
        "#     final_vec = ' '.join(map(str, list(weights[i, :])))\n",
        "#     vect_file.write('{} {}\\n'.format(text, final_vec))\n",
        "# vect_file.close()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4c7645a9",
      "metadata": {
        "id": "4c7645a9"
      },
      "outputs": [],
      "source": [
        "# Embedding of the query word that the following cell compares against.\n",
        "query_word = \"king\"\n",
        "eq = word2vec[query_word]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "95dccf9b",
      "metadata": {
        "id": "95dccf9b",
        "outputId": "01140e77-dc61-425b-9b44-e013e02cba2d",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[['sovereign', -0.3292683],\n",
              " ['four', -0.2554933],\n",
              " ['princess', -0.24286719],\n",
              " ['model', -0.22898991],\n",
              " ['mesoamerica', -0.21277258]]"
            ]
          },
          "metadata": {},
          "execution_count": 40
        }
      ],
      "source": [
        "# Rank every word by cosine similarity to `eq`, most similar first.\n",
        "# A higher cosine similarity means a closer word, so sort descending.\n",
        "min_words = []\n",
        "for word, vec in word2vec.items():\n",
        "    # Skip the query vector itself; it would always rank first with 1.0.\n",
        "    if vec is eq:\n",
        "        continue\n",
        "    min_words.append([word, get_cosine_sim(eq, vec)])\n",
        "sorted(min_words, key=lambda w: w[1], reverse=True)[:5]"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.10"
    },
    "colab": {
      "provenance": [],
      "name": "Continuous Bag of Words",
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/OverPoweredDev/fc42001de01a4d5d087c6ba6cf6c18b8/cbow.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "2b4f5fe3",
	"metadata": {
	"id": "2b4f5fe3"
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import keras.backend as K\n",
	"from keras.models import Sequential\n",
	"from keras.layers import Dense, Embedding, Lambda\n",
	"from keras.utils import np_utils\n",
	"from keras.preprocessing import sequence\n",
	"from keras.preprocessing.text import Tokenizer\n",
	"import gensim\n",
	"import nltk"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ed0fd15c",
	"metadata": {
	"id": "ed0fd15c"
	},
	"outputs": [],
	"source": [
	"# Read the corpus, keeping only lines that contain at least two spaces.\n",
	"# The with-block closes the handle, so `data` is a closed file afterwards;\n",
	"# later cells should iterate `corona_data`, not `data`.\n",
	"with open('text', 'r') as data:\n",
	"    corona_data = [text for text in data if text.count(' ') >= 2]\n",
	"\n",
	"# Learn the word -> id mapping, then turn every line into a list of ids.\n",
	"vectorize = Tokenizer()\n",
	"vectorize.fit_on_texts(corona_data)\n",
	"corona_data = vectorize.texts_to_sequences(corona_data)\n",
	"\n",
	"# Vocabulary size: word ids start at 1, so add one slot for id 0,\n",
	"# which pad_sequences uses as its default padding value.\n",
	"total_vocab = len(vectorize.word_index) + 1\n",
	"# Total number of tokens in the filtered corpus.\n",
	"word_count = sum(len(s) for s in corona_data)\n",
	"window_size = 2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "8f0eb51a",
	"metadata": {
	"id": "8f0eb51a"
	},
	"outputs": [],
	"source": [
	"def get_cosine_sim(A,B):\n",
	"    \"\"\"Return the cosine similarity between vectors A and B.\"\"\"\n",
	"    norm_product = np.linalg.norm(A) * np.linalg.norm(B)\n",
	"    return np.dot(A, B) / norm_product"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "17cb7fb1",
	"metadata": {
	"id": "17cb7fb1"
	},
	"outputs": [],
	"source": [
	"def cbow_model(data, window_size, total_vocab):\n",
	"    \"\"\"Yield one (context, target) CBOW training pair per word in `data`.\n",
	"\n",
	"    Args:\n",
	"        data: iterable of sequences of integer word ids.\n",
	"        window_size: number of neighbours taken on each side of the target.\n",
	"        total_vocab: number of classes used to one-hot encode the target.\n",
	"\n",
	"    Yields:\n",
	"        (contextual, final_target): the context ids padded to\n",
	"        2 * window_size columns, and the one-hot encoded target word.\n",
	"    \"\"\"\n",
	"    total_length = window_size * 2\n",
	"    for text in data:\n",
	"        text_len = len(text)\n",
	"        for idx, word in enumerate(text):\n",
	"            begin = idx - window_size\n",
	"            end = idx + window_size + 1\n",
	"            # Neighbours inside the window, skipping the target itself and\n",
	"            # positions that fall off either end of the sequence.\n",
	"            context_word = [[text[i] for i in range(begin, end)\n",
	"                             if 0 <= i < text_len and i != idx]]\n",
	"            target = [word]\n",
	"            # `maxlen` is the pad_sequences keyword for the fixed output\n",
	"            # width; `total_length` is not an accepted keyword.\n",
	"            contextual = sequence.pad_sequences(context_word, maxlen=total_length)\n",
	"            final_target = np_utils.to_categorical(target, total_vocab)\n",
	"            yield (contextual, final_target)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "91502185",
	"metadata": {
	"id": "91502185"
	},
	"outputs": [],
	"source": [
	"# CBOW network: embed the context ids, average the embeddings over the\n",
	"# context axis, then predict the target word with a softmax layer.\n",
	"model = Sequential()\n",
	"model.add(Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size*2))\n",
	"model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))\n",
	"model.add(Dense(total_vocab, activation='softmax'))\n",
	"model.compile(loss='categorical_crossentropy', optimizer='adam')\n",
	"\n",
	"for i in range(10):\n",
	"    cost = 0\n",
	"    # Iterate the tokenised corpus (`corona_data`); `data` is the raw file\n",
	"    # handle, which has already been read through when the corpus was loaded.\n",
	"    for x, y in cbow_model(corona_data, window_size, total_vocab):\n",
	"        # Train on the pair the generator just yielded.\n",
	"        cost += model.train_on_batch(x, y)\n",
	"    print(i, cost)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ad7fb11d",
	"metadata": {
	"id": "ad7fb11d"
	},
	"outputs": [],
	"source": [
	"# Open the output file and write its header line: the value of\n",
	"# total_vocab followed by the vector dimensionality.\n",
	"# NOTE(review): the handle stays open after this cell.\n",
	"dimensions = 100\n",
	"vect_file = open('vectors.txt', mode='w')\n",
	"header = '{} {}\\n'.format(total_vocab, dimensions)\n",
	"vect_file.write(header)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "7dc257bc",
	"metadata": {
	"id": "7dc257bc"
	},
	"outputs": [],
	"source": [
	"# First weight array of the model (the Embedding layer is added first\n",
	"# when the model is built), indexed by word id.\n",
	"weights = model.get_weights()[0]\n",
	"\n",
	"# Map every word known to the tokenizer to its row of `weights`.\n",
	"word2vec = {text: weights[i, :] for text, i in vectorize.word_index.items()}\n",
	"\n",
	"# Disabled: dump the vectors to `vect_file`, one word per line.\n",
	"# for text, i in vectorize.word_index.items():\n",
	"#     final_vec = ' '.join(map(str, list(weights[i, :])))\n",
	"#     vect_file.write('{} {}\\n'.format(text, final_vec))\n",
	"# vect_file.close()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4c7645a9",
	"metadata": {
	"id": "4c7645a9"
	},
	"outputs": [],
	"source": [
	"# Embedding of the query word that the following cell compares against.\n",
	"query_word = \"king\"\n",
	"eq = word2vec[query_word]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "95dccf9b",
	"metadata": {
	"id": "95dccf9b",
	"outputId": "01140e77-dc61-425b-9b44-e013e02cba2d",
	"colab": {
	"base_uri": "https://localhost:8080/"
	}
	},
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"[['sovereign', -0.3292683],\n",
	" ['four', -0.2554933],\n",
	" ['princess', -0.24286719],\n",
	" ['model', -0.22898991],\n",
	" ['mesoamerica', -0.21277258]]"
	]
	},
	"metadata": {},
	"execution_count": 40
	}
	],
	"source": [
	"# Rank every word by cosine similarity to `eq`, most similar first.\n",
	"# A higher cosine similarity means a closer word, so sort descending.\n",
	"min_words = []\n",
	"for word, vec in word2vec.items():\n",
	"    # Skip the query vector itself; it would always rank first with 1.0.\n",
	"    if vec is eq:\n",
	"        continue\n",
	"    min_words.append([word, get_cosine_sim(eq, vec)])\n",
	"sorted(min_words, key=lambda w: w[1], reverse=True)[:5]"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.10"
	},
	"colab": {
	"provenance": [],
	"name": "Continuous Bag of Words",
	"include_colab_link": true
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}