omairaasim/Gensim_Exercise_30.ipynb

## Gensim_Exercise_30.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "de453699",
   "metadata": {},
   "source": [
    "## Recipe Objective: How to find the similarity of a query document to every document in the corpus?"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b3fecc9",
   "metadata": {},
   "source": [
    "#### Once a TF-IDF model has been trained, you can do many fun things with it. For example, you can compare a query document against every document in the corpus:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f8840299",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.4690727), (1, 0.072158165), (2, 0.062832855)]\n"
     ]
    }
   ],
   "source": [
    "#importing required libraries\n",
    "from gensim import similarities\n",
    "from gensim import models\n",
    "import gensim\n",
    "from gensim import corpora\n",
    "\n",
    "#creating a sample corpus for demonstration purpose\n",
    "txt_corpus = [\"This is sample document\",\n",
    "             \"Collection of documents make a corpus\",\n",
    "             \"You can vectorize your corpus\"]\n",
    "\n",
    "#creating a set of frequent words (stopwords) that are dropped from every document\n",
    "stoplist = set('for a of the and to in on of to are at'.split(' '))\n",
    "\n",
    "#lowercasing each document, using white space as delimiter and filtering out the stopwords\n",
    "processed_text = [[word for word in document.lower().split() if word not in stoplist]\n",
    "                  for document in txt_corpus]\n",
    "\n",
    "#creating a dictionary that maps every remaining token to an integer id\n",
    "dictionary = corpora.Dictionary(processed_text)\n",
    "\n",
    "#using doc2bow for vectorization of the entire corpus\n",
    "bow_vec = [dictionary.doc2bow(text) for text in processed_text]\n",
    "\n",
    "#training the TF-IDF model on the bag-of-words corpus\n",
    "tfidf_model = models.TfidfModel(bow_vec)\n",
    "\n",
    "#indexing the TF-IDF corpus; num_features is the vocabulary size, so it is read from the\n",
    "#dictionary (12 tokens for this corpus) and stays correct if the corpus changes\n",
    "index = similarities.SparseMatrixSimilarity(tfidf_model[bow_vec], num_features=len(dictionary))\n",
    "\n",
    "#finding the similarity of our sample document sample_document against every document in the corpus\n",
    "sample_document = 'sample corpus'.split()\n",
    "sample_bow = dictionary.doc2bow(sample_document)\n",
    "sims = index[tfidf_model[sample_bow]]\n",
    "\n",
    "#printing (document index, similarity score) pairs in corpus order\n",
    "print(list(enumerate(sims)))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ce3b4342",
   "metadata": {},
   "source": [
    "Document 0 has a similarity score of 0.469 (about 47%), document 1 has a score of about 7%, and document 2 has a score of about 6%. We can make this more readable by sorting:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "778ce691",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 0.4690727\n",
      "1 0.072158165\n",
      "2 0.062832855\n"
     ]
    }
   ],
   "source": [
    "#ranking the corpus documents from most to least similar to the sample document\n",
    "ranked_sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)\n",
    "\n",
    "#printing one \"document index, score\" line per corpus document\n",
    "for document_number, score in ranked_sims:\n",
    "    print(document_number, score)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8dc4e47b",
   "metadata": {},
   "source": [
    "Document 0 is most similar to the sample document."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0355b47",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "de453699",
	"metadata": {},
	"source": [
	"## Recipe Objective: How to find the similarity of a query document to every document in the corpus?"
	]
	},
	{
	"cell_type": "markdown",
	"id": "6b3fecc9",
	"metadata": {},
	"source": [
	"#### Once a TF-IDF model has been trained, you can do many fun things with it. For example, you can compare a query document against every document in the corpus:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "f8840299",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[(0, 0.4690727), (1, 0.072158165), (2, 0.062832855)]\n"
	]
	}
	],
	"source": [
	"#importing required libraries\n",
	"from gensim import similarities\n",
	"from gensim import models\n",
	"import gensim\n",
	"from gensim import corpora\n",
	"\n",
	"#creating a sample corpus for demonstration purpose\n",
	"txt_corpus = [\"This is sample document\",\n",
	"             \"Collection of documents make a corpus\",\n",
	"             \"You can vectorize your corpus\"]\n",
	"\n",
	"#creating a set of frequent words (stopwords) that are dropped from every document\n",
	"stoplist = set('for a of the and to in on of to are at'.split(' '))\n",
	"\n",
	"#lowercasing each document, using white space as delimiter and filtering out the stopwords\n",
	"processed_text = [[word for word in document.lower().split() if word not in stoplist]\n",
	"                  for document in txt_corpus]\n",
	"\n",
	"#creating a dictionary that maps every remaining token to an integer id\n",
	"dictionary = corpora.Dictionary(processed_text)\n",
	"\n",
	"#using doc2bow for vectorization of the entire corpus\n",
	"bow_vec = [dictionary.doc2bow(text) for text in processed_text]\n",
	"\n",
	"#training the TF-IDF model on the bag-of-words corpus\n",
	"tfidf_model = models.TfidfModel(bow_vec)\n",
	"\n",
	"#indexing the TF-IDF corpus; num_features is the vocabulary size, so it is read from the\n",
	"#dictionary (12 tokens for this corpus) and stays correct if the corpus changes\n",
	"index = similarities.SparseMatrixSimilarity(tfidf_model[bow_vec], num_features=len(dictionary))\n",
	"\n",
	"#finding the similarity of our sample document sample_document against every document in the corpus\n",
	"sample_document = 'sample corpus'.split()\n",
	"sample_bow = dictionary.doc2bow(sample_document)\n",
	"sims = index[tfidf_model[sample_bow]]\n",
	"\n",
	"#printing (document index, similarity score) pairs in corpus order\n",
	"print(list(enumerate(sims)))"
	]
	},
	{
	"cell_type": "markdown",
	"id": "ce3b4342",
	"metadata": {},
	"source": [
	"Document 0 has a similarity score of 0.469 (about 47%), document 1 has a score of about 7%, and document 2 has a score of about 6%. We can make this more readable by sorting:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "778ce691",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0 0.4690727\n",
	"1 0.072158165\n",
	"2 0.062832855\n"
	]
	}
	],
	"source": [
	"#ranking the corpus documents from most to least similar to the sample document\n",
	"ranked_sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)\n",
	"\n",
	"#printing one \"document index, score\" line per corpus document\n",
	"for document_number, score in ranked_sims:\n",
	"    print(document_number, score)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "8dc4e47b",
	"metadata": {},
	"source": [
	"Document 0 is most similar to the sample document."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b0355b47",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}