Skip to content

Instantly share code, notes, and snippets.

@OverPoweredDev
Last active December 14, 2022 05:40
Show Gist options
  • Save OverPoweredDev/fc42001de01a4d5d087c6ba6cf6c18b8 to your computer and use it in GitHub Desktop.
Save OverPoweredDev/fc42001de01a4d5d087c6ba6cf6c18b8 to your computer and use it in GitHub Desktop.
Continuous Bag of Words
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/OverPoweredDev/fc42001de01a4d5d087c6ba6cf6c18b8/cbow.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b4f5fe3",
"metadata": {
"id": "2b4f5fe3"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import keras.backend as K\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense, Embedding, Lambda\n",
"from keras.utils import np_utils\n",
"from keras.preprocessing import sequence\n",
"from keras.preprocessing.text import Tokenizer\n",
"import gensim\n",
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed0fd15c",
"metadata": {
"id": "ed0fd15c"
},
"outputs": [],
"source": [
"# Collect the lines of the corpus file that contain at least two spaces. Reading inside\n",
"# a `with` block closes the handle afterwards, so `data` is a closed file from here on and\n",
"# `corona_data` is the object that holds the corpus.\n",
"with open('text', 'r') as data:\n",
"    corona_data = [text for text in data if text.count(' ') >= 2]\n",
"\n",
"# Build the word index from the corpus, then encode every line as a list of word ids.\n",
"vectorize = Tokenizer()\n",
"vectorize.fit_on_texts(corona_data)\n",
"corona_data = vectorize.texts_to_sequences(corona_data)\n",
"\n",
"# NOTE: despite its name, total_vocab is the total number of tokens in the corpus,\n",
"# not the number of distinct words.\n",
"total_vocab = sum(len(s) for s in corona_data)\n",
"# Number of distinct words known to the tokenizer, plus one.\n",
"word_count = len(vectorize.word_index) + 1\n",
"window_size = 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f0eb51a",
"metadata": {
"id": "8f0eb51a"
},
"outputs": [],
"source": [
"def get_cosine_sim(A,B):\n",
" return np.dot(A,B)/(np.linalg.norm(A)*np.linalg.norm(B))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17cb7fb1",
"metadata": {
"id": "17cb7fb1"
},
"outputs": [],
"source": [
"def cbow_model(data, window_size, total_vocab):\n",
"    \"\"\"Generate CBOW training pairs from tokenized text.\n",
"\n",
"    For every word of every sequence in `data`, yields the surrounding words\n",
"    (up to `window_size` on each side) as context and the word itself as target.\n",
"\n",
"    Args:\n",
"        data: iterable of sequences of word ids (in this notebook, the output of\n",
"            Tokenizer.texts_to_sequences).\n",
"        window_size: number of context positions taken on each side of the target.\n",
"        total_vocab: number of classes used to one-hot encode the target.\n",
"\n",
"    Yields:\n",
"        (contextual, final_target): the context ids padded to length\n",
"        2 * window_size, and the one-hot encoded target word.\n",
"    \"\"\"\n",
"    total_length = window_size * 2\n",
"    for text in data:\n",
"        text_len = len(text)\n",
"        for idx, word in enumerate(text):\n",
"            begin = idx - window_size\n",
"            end = idx + window_size + 1\n",
"            # Neighbours inside the window, skipping the target and out-of-range positions.\n",
"            context_word = [[text[i] for i in range(begin, end) if 0 <= i < text_len and i != idx]]\n",
"            target = [word]\n",
"            # pad_sequences takes `maxlen` (it has no `total_length` argument); words near the\n",
"            # edges of a sequence have fewer neighbours and are padded up to the full width.\n",
"            contextual = sequence.pad_sequences(context_word, maxlen=total_length)\n",
"            final_target = np_utils.to_categorical(target, total_vocab)\n",
"            yield (contextual, final_target)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91502185",
"metadata": {
"id": "91502185"
},
"outputs": [],
"source": [
"# CBOW network: embed the context words, average their vectors, predict the target word.\n",
"model = Sequential()\n",
"model.add(Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size*2))\n",
"model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))\n",
"model.add(Dense(total_vocab, activation='softmax'))\n",
"model.compile(loss='categorical_crossentropy', optimizer='adam')\n",
"\n",
"for i in range(10):\n",
"    cost = 0\n",
"    # Iterate over the tokenized corpus (`corona_data`); `data` is the raw file handle, which\n",
"    # was already consumed when the corpus was read and would yield no samples.\n",
"    for x, y in cbow_model(corona_data, window_size, total_vocab):\n",
"        # Train on the pair produced by the generator; `contextual`/`final_target` are local\n",
"        # to cbow_model and are not defined in this scope.\n",
"        cost += model.train_on_batch(x, y)\n",
"    print(i, cost)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad7fb11d",
"metadata": {
"id": "ad7fb11d"
},
"outputs": [],
"source": [
"dimensions = 100\n",
"# Write the header line `<count> <dimensions>` through a context manager so the data is\n",
"# flushed and the handle is released; reopen the file in append mode to add vectors later.\n",
"with open('vectors.txt', 'w') as vect_file:\n",
"    vect_file.write('{} {}\\n'.format(total_vocab, dimensions))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7dc257bc",
"metadata": {
"id": "7dc257bc"
},
"outputs": [],
"source": [
"# First weight array of the model; rows are looked up by tokenizer word index below.\n",
"weights = model.get_weights()[0]\n",
"\n",
"# Map every word known to the tokenizer to its row of the weight matrix.\n",
"word2vec = {text: weights[i, :] for text, i in vectorize.word_index.items()}\n",
"\n",
"# Disabled export of the vectors in text form (one `<word> <values...>` line per word):\n",
"# for text, i in vectorize.word_index.items():\n",
"#     final_vec = ' '.join(map(str, list(weights[i, :])))\n",
"#     vect_file.write('{} {}\\n'.format(text, final_vec))\n",
"# vect_file.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c7645a9",
"metadata": {
"id": "4c7645a9"
},
"outputs": [],
"source": [
"# Query vector: the embedding stored for the word 'king'.\n",
"eq = word2vec['king']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95dccf9b",
"metadata": {
"id": "95dccf9b",
"outputId": "01140e77-dc61-425b-9b44-e013e02cba2d",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[['sovereign', -0.3292683],\n",
" ['four', -0.2554933],\n",
" ['princess', -0.24286719],\n",
" ['model', -0.22898991],\n",
" ['mesoamerica', -0.21277258]]"
]
},
"metadata": {},
"execution_count": 40
}
],
"source": [
"# Score every vocabulary word by cosine similarity to the query vector `eq`.\n",
"min_words = [\n",
"    [word, get_cosine_sim(eq, vector)]\n",
"    for word, vector in word2vec.items()\n",
"    if vector is not eq  # skip the query vector itself (its similarity is trivially 1)\n",
"]\n",
"# Cosine similarity grows with closeness, so the nearest words are the largest values:\n",
"# sort in descending order before taking the top five.\n",
"sorted(min_words, key=lambda w: w[1], reverse=True)[:5]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"colab": {
"provenance": [],
"name": "Continuous Bag of Words",
"include_colab_link": true
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment