iamirmasoud/tf-idf.ipynb

## tf-idf.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "030a8739",
   "metadata": {},
   "source": [
    "# TF-IDF from scratch\n",
    "\n",
    "This is the notebook file for the following post:\n",
    "\n",
    "[Understanding TF-IDF with Python example](http://www.sefidian.com/2022/02/28/understanding-tf-idf-with-python-example)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80326f4d",
   "metadata": {},
   "source": [
    "# Manual calculation of TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "85cab12d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8d810df5",
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus = ['data science is one of the most important fields of science',\n",
    "          'this is one of the best data science courses',\n",
    "          'data scientists analyze data' ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "86ec1722",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of words in the corpus: 14\n",
      "The words in the corpus: \n",
      " {'scientists', 'is', 'of', 'data', 'analyze', 'courses', 'one', 'this', 'the', 'fields', 'most', 'science', 'important', 'best'}\n"
     ]
    }
   ],
   "source": [
    "words_set = set()\n",
    " \n",
    "for doc in  corpus:\n",
    "    words = doc.split(' ')\n",
    "    words_set = words_set.union(set(words))\n",
    "     \n",
    "print('Number of words in the corpus:',len(words_set))\n",
    "print('The words in the corpus: \\n', words_set)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eaa017e7",
   "metadata": {},
   "source": [
    "## Computing Term Frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5ab67f43",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>scientists</th>\n",
       "      <th>is</th>\n",
       "      <th>of</th>\n",
       "      <th>data</th>\n",
       "      <th>analyze</th>\n",
       "      <th>courses</th>\n",
       "      <th>one</th>\n",
       "      <th>this</th>\n",
       "      <th>the</th>\n",
       "      <th>fields</th>\n",
       "      <th>most</th>\n",
       "      <th>science</th>\n",
       "      <th>important</th>\n",
       "      <th>best</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.181818</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.181818</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.25</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   scientists        is        of      data  analyze   courses       one  \\\n",
       "0        0.00  0.090909  0.181818  0.090909     0.00  0.000000  0.090909   \n",
       "1        0.00  0.111111  0.111111  0.111111     0.00  0.111111  0.111111   \n",
       "2        0.25  0.000000  0.000000  0.500000     0.25  0.000000  0.000000   \n",
       "\n",
       "       this       the    fields      most   science  important      best  \n",
       "0  0.000000  0.090909  0.090909  0.090909  0.181818   0.090909  0.000000  \n",
       "1  0.111111  0.111111  0.000000  0.000000  0.111111   0.000000  0.111111  \n",
       "2  0.000000  0.000000  0.000000  0.000000  0.000000   0.000000  0.000000  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n_docs = len(corpus)         # Number of documents in the corpus\n",
    "n_words_set = len(words_set) # Number of unique words in the\n",
    " \n",
    "df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)\n",
    " \n",
    "# Compute Term Frequency (TF)\n",
    "for i in range(n_docs):\n",
    "    words = corpus[i].split(' ') # Words in the document\n",
    "    for w in words:\n",
    "        df_tf[w][i] = df_tf[w][i] + (1 / len(words))\n",
    "         \n",
    "df_tf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "91e58674",
   "metadata": {},
   "source": [
    "## Computing Inverse Document Frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9e887ed1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "IDF of: \n",
      "     scientists: 0.47712125471966244\n",
      "             is: 0.17609125905568124\n",
      "             of: 0.17609125905568124\n",
      "           data:        0.0\n",
      "        analyze: 0.47712125471966244\n",
      "        courses: 0.47712125471966244\n",
      "            one: 0.17609125905568124\n",
      "           this: 0.47712125471966244\n",
      "            the: 0.17609125905568124\n",
      "         fields: 0.47712125471966244\n",
      "           most: 0.47712125471966244\n",
      "        science: 0.17609125905568124\n",
      "      important: 0.47712125471966244\n",
      "           best: 0.47712125471966244\n"
     ]
    }
   ],
   "source": [
    "print(\"IDF of: \")\n",
    " \n",
    "idf = {}\n",
    " \n",
    "for w in words_set:\n",
    "    k = 0    # number of documents in the corpus that contain this word\n",
    "     \n",
    "    for i in range(n_docs):\n",
    "        if w in corpus[i].split():\n",
    "            k += 1\n",
    "             \n",
    "    idf[w] =  np.log10(n_docs / k)\n",
    "     \n",
    "    print(f'{w:>15}: {idf[w]:>10}' )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ada1fef3",
   "metadata": {},
   "source": [
    "## Putting it Together: Computing TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a9bd2aad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>scientists</th>\n",
       "      <th>is</th>\n",
       "      <th>of</th>\n",
       "      <th>data</th>\n",
       "      <th>analyze</th>\n",
       "      <th>courses</th>\n",
       "      <th>one</th>\n",
       "      <th>this</th>\n",
       "      <th>the</th>\n",
       "      <th>fields</th>\n",
       "      <th>most</th>\n",
       "      <th>science</th>\n",
       "      <th>important</th>\n",
       "      <th>best</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.016008</td>\n",
       "      <td>0.032017</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.016008</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.016008</td>\n",
       "      <td>0.043375</td>\n",
       "      <td>0.043375</td>\n",
       "      <td>0.032017</td>\n",
       "      <td>0.043375</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.019566</td>\n",
       "      <td>0.019566</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.053013</td>\n",
       "      <td>0.019566</td>\n",
       "      <td>0.053013</td>\n",
       "      <td>0.019566</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.019566</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.053013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.11928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.11928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   scientists        is        of  data  analyze   courses       one  \\\n",
       "0     0.00000  0.016008  0.032017   0.0  0.00000  0.000000  0.016008   \n",
       "1     0.00000  0.019566  0.019566   0.0  0.00000  0.053013  0.019566   \n",
       "2     0.11928  0.000000  0.000000   0.0  0.11928  0.000000  0.000000   \n",
       "\n",
       "       this       the    fields      most   science  important      best  \n",
       "0  0.000000  0.016008  0.043375  0.043375  0.032017   0.043375  0.000000  \n",
       "1  0.053013  0.019566  0.000000  0.000000  0.019566   0.000000  0.053013  \n",
       "2  0.000000  0.000000  0.000000  0.000000  0.000000   0.000000  0.000000  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tf_idf = df_tf.copy()\n",
    " \n",
    "for w in words_set:\n",
    "    for i in range(n_docs):\n",
    "        df_tf_idf[w][i] = df_tf[w][i] * idf[w]\n",
    "         \n",
    "df_tf_idf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "062e791b",
   "metadata": {},
   "source": [
    "# TF-IDF Using scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "382c5321",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "35061ba7",
   "metadata": {},
   "outputs": [],
   "source": [
    "tr_idf_model  = TfidfVectorizer()\n",
    "tf_idf_vector = tr_idf_model.fit_transform(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7780772d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'scipy.sparse.csr.csr_matrix'> (3, 14)\n"
     ]
    }
   ],
   "source": [
    "print(type(tf_idf_vector), tf_idf_vector.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "bc69a9d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.         0.         0.         0.18952581 0.32089509 0.32089509\n",
      "  0.24404899 0.32089509 0.48809797 0.24404899 0.48809797 0.\n",
      "  0.24404899 0.        ]\n",
      " [0.         0.40029393 0.40029393 0.23642005 0.         0.\n",
      "  0.30443385 0.         0.30443385 0.30443385 0.30443385 0.\n",
      "  0.30443385 0.40029393]\n",
      " [0.54270061 0.         0.         0.64105545 0.         0.\n",
      "  0.         0.         0.         0.         0.         0.54270061\n",
      "  0.         0.        ]]\n"
     ]
    }
   ],
   "source": [
    "tf_idf_array = tf_idf_vector.toarray()\n",
    " \n",
    "print(tf_idf_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "862dbaf1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['analyze' 'best' 'courses' 'data' 'fields' 'important' 'is' 'most' 'of'\n",
      " 'one' 'science' 'scientists' 'the' 'this']\n"
     ]
    }
   ],
   "source": [
    "words_set = tr_idf_model.get_feature_names_out()\n",
    " \n",
    "print(words_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "838bf0d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>analyze</th>\n",
       "      <th>best</th>\n",
       "      <th>courses</th>\n",
       "      <th>data</th>\n",
       "      <th>fields</th>\n",
       "      <th>important</th>\n",
       "      <th>is</th>\n",
       "      <th>most</th>\n",
       "      <th>of</th>\n",
       "      <th>one</th>\n",
       "      <th>science</th>\n",
       "      <th>scientists</th>\n",
       "      <th>the</th>\n",
       "      <th>this</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.189526</td>\n",
       "      <td>0.320895</td>\n",
       "      <td>0.320895</td>\n",
       "      <td>0.244049</td>\n",
       "      <td>0.320895</td>\n",
       "      <td>0.488098</td>\n",
       "      <td>0.244049</td>\n",
       "      <td>0.488098</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.244049</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400294</td>\n",
       "      <td>0.400294</td>\n",
       "      <td>0.236420</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.304434</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.304434</td>\n",
       "      <td>0.304434</td>\n",
       "      <td>0.304434</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.304434</td>\n",
       "      <td>0.400294</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.542701</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.641055</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.542701</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    analyze      best   courses      data    fields  important        is  \\\n",
       "0  0.000000  0.000000  0.000000  0.189526  0.320895   0.320895  0.244049   \n",
       "1  0.000000  0.400294  0.400294  0.236420  0.000000   0.000000  0.304434   \n",
       "2  0.542701  0.000000  0.000000  0.641055  0.000000   0.000000  0.000000   \n",
       "\n",
       "       most        of       one   science  scientists       the      this  \n",
       "0  0.320895  0.488098  0.244049  0.488098    0.000000  0.244049  0.000000  \n",
       "1  0.000000  0.304434  0.304434  0.304434    0.000000  0.304434  0.400294  \n",
       "2  0.000000  0.000000  0.000000  0.000000    0.542701  0.000000  0.000000  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set)\n",
    " \n",
    "df_tf_idf"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "030a8739",
	"metadata": {},
	"source": [
	"# TF-IDF from scratch\n",
	"\n",
	"This is the notebook file for the following post:\n",
	"\n",
	"[Understanding TF-IDF with Python example](http://www.sefidian.com/2022/02/28/understanding-tf-idf-with-python-example)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "80326f4d",
	"metadata": {},
	"source": [
	"# Manual calculation of TF-IDF"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "85cab12d",
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "8d810df5",
	"metadata": {},
	"outputs": [],
	"source": [
	"corpus = ['data science is one of the most important fields of science',\n",
	" 'this is one of the best data science courses',\n",
	" 'data scientists analyze data' ]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "86ec1722",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Number of words in the corpus: 14\n",
	"The words in the corpus: \n",
	" {'scientists', 'is', 'of', 'data', 'analyze', 'courses', 'one', 'this', 'the', 'fields', 'most', 'science', 'important', 'best'}\n"
	]
	}
	],
	"source": [
	"words_set = set()\n",
	" \n",
	"for doc in corpus:\n",
	" words = doc.split(' ')\n",
	" words_set = words_set.union(set(words))\n",
	" \n",
	"print('Number of words in the corpus:',len(words_set))\n",
	"print('The words in the corpus: \\n', words_set)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "eaa017e7",
	"metadata": {},
	"source": [
	"## Computing Term Frequency"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "5ab67f43",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>scientists</th>\n",
	" <th>is</th>\n",
	" <th>of</th>\n",
	" <th>data</th>\n",
	" <th>analyze</th>\n",
	" <th>courses</th>\n",
	" <th>one</th>\n",
	" <th>this</th>\n",
	" <th>the</th>\n",
	" <th>fields</th>\n",
	" <th>most</th>\n",
	" <th>science</th>\n",
	" <th>important</th>\n",
	" <th>best</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0.00</td>\n",
	" <td>0.090909</td>\n",
	" <td>0.181818</td>\n",
	" <td>0.090909</td>\n",
	" <td>0.00</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.090909</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.090909</td>\n",
	" <td>0.090909</td>\n",
	" <td>0.090909</td>\n",
	" <td>0.181818</td>\n",
	" <td>0.090909</td>\n",
	" <td>0.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>0.00</td>\n",
	" <td>0.111111</td>\n",
	" <td>0.111111</td>\n",
	" <td>0.111111</td>\n",
	" <td>0.00</td>\n",
	" <td>0.111111</td>\n",
	" <td>0.111111</td>\n",
	" <td>0.111111</td>\n",
	" <td>0.111111</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.111111</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.111111</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>0.25</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.500000</td>\n",
	" <td>0.25</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" scientists is of data analyze courses one \\\n",
	"0 0.00 0.090909 0.181818 0.090909 0.00 0.000000 0.090909 \n",
	"1 0.00 0.111111 0.111111 0.111111 0.00 0.111111 0.111111 \n",
	"2 0.25 0.000000 0.000000 0.500000 0.25 0.000000 0.000000 \n",
	"\n",
	" this the fields most science important best \n",
	"0 0.000000 0.090909 0.090909 0.090909 0.181818 0.090909 0.000000 \n",
	"1 0.111111 0.111111 0.000000 0.000000 0.111111 0.000000 0.111111 \n",
	"2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 "
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"n_docs = len(corpus) # Number of documents in the corpus\n",
	"n_words_set = len(words_set) # Number of unique words in the\n",
	" \n",
	"df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)\n",
	" \n",
	"# Compute Term Frequency (TF)\n",
	"for i in range(n_docs):\n",
	" words = corpus[i].split(' ') # Words in the document\n",
	" for w in words:\n",
	" df_tf[w][i] = df_tf[w][i] + (1 / len(words))\n",
	" \n",
	"df_tf"
	]
	},
	{
	"cell_type": "markdown",
	"id": "91e58674",
	"metadata": {},
	"source": [
	"## Computing Inverse Document Frequency"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "9e887ed1",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"IDF of: \n",
	" scientists: 0.47712125471966244\n",
	" is: 0.17609125905568124\n",
	" of: 0.17609125905568124\n",
	" data: 0.0\n",
	" analyze: 0.47712125471966244\n",
	" courses: 0.47712125471966244\n",
	" one: 0.17609125905568124\n",
	" this: 0.47712125471966244\n",
	" the: 0.17609125905568124\n",
	" fields: 0.47712125471966244\n",
	" most: 0.47712125471966244\n",
	" science: 0.17609125905568124\n",
	" important: 0.47712125471966244\n",
	" best: 0.47712125471966244\n"
	]
	}
	],
	"source": [
	"print(\"IDF of: \")\n",
	" \n",
	"idf = {}\n",
	" \n",
	"for w in words_set:\n",
	" k = 0 # number of documents in the corpus that contain this word\n",
	" \n",
	" for i in range(n_docs):\n",
	" if w in corpus[i].split():\n",
	" k += 1\n",
	" \n",
	" idf[w] = np.log10(n_docs / k)\n",
	" \n",
	" print(f'{w:>15}: {idf[w]:>10}' )"
	]
	},
	{
	"cell_type": "markdown",
	"id": "ada1fef3",
	"metadata": {},
	"source": [
	"## Putting it Together: Computing TF-IDF"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "a9bd2aad",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>scientists</th>\n",
	" <th>is</th>\n",
	" <th>of</th>\n",
	" <th>data</th>\n",
	" <th>analyze</th>\n",
	" <th>courses</th>\n",
	" <th>one</th>\n",
	" <th>this</th>\n",
	" <th>the</th>\n",
	" <th>fields</th>\n",
	" <th>most</th>\n",
	" <th>science</th>\n",
	" <th>important</th>\n",
	" <th>best</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0.00000</td>\n",
	" <td>0.016008</td>\n",
	" <td>0.032017</td>\n",
	" <td>0.0</td>\n",
	" <td>0.00000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.016008</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.016008</td>\n",
	" <td>0.043375</td>\n",
	" <td>0.043375</td>\n",
	" <td>0.032017</td>\n",
	" <td>0.043375</td>\n",
	" <td>0.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>0.00000</td>\n",
	" <td>0.019566</td>\n",
	" <td>0.019566</td>\n",
	" <td>0.0</td>\n",
	" <td>0.00000</td>\n",
	" <td>0.053013</td>\n",
	" <td>0.019566</td>\n",
	" <td>0.053013</td>\n",
	" <td>0.019566</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.019566</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.053013</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>0.11928</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.0</td>\n",
	" <td>0.11928</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" scientists is of data analyze courses one \\\n",
	"0 0.00000 0.016008 0.032017 0.0 0.00000 0.000000 0.016008 \n",
	"1 0.00000 0.019566 0.019566 0.0 0.00000 0.053013 0.019566 \n",
	"2 0.11928 0.000000 0.000000 0.0 0.11928 0.000000 0.000000 \n",
	"\n",
	" this the fields most science important best \n",
	"0 0.000000 0.016008 0.043375 0.043375 0.032017 0.043375 0.000000 \n",
	"1 0.053013 0.019566 0.000000 0.000000 0.019566 0.000000 0.053013 \n",
	"2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 "
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_tf_idf = df_tf.copy()\n",
	" \n",
	"for w in words_set:\n",
	" for i in range(n_docs):\n",
	" df_tf_idf[w][i] = df_tf[w][i] * idf[w]\n",
	" \n",
	"df_tf_idf"
	]
	},
	{
	"cell_type": "markdown",
	"id": "062e791b",
	"metadata": {},
	"source": [
	"# TF-IDF Using scikit-learn"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "382c5321",
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.feature_extraction.text import TfidfVectorizer"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "35061ba7",
	"metadata": {},
	"outputs": [],
	"source": [
	"tr_idf_model = TfidfVectorizer()\n",
	"tf_idf_vector = tr_idf_model.fit_transform(corpus)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "7780772d",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'scipy.sparse.csr.csr_matrix'> (3, 14)\n"
	]
	}
	],
	"source": [
	"print(type(tf_idf_vector), tf_idf_vector.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "bc69a9d9",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[[0. 0. 0. 0.18952581 0.32089509 0.32089509\n",
	" 0.24404899 0.32089509 0.48809797 0.24404899 0.48809797 0.\n",
	" 0.24404899 0. ]\n",
	" [0. 0.40029393 0.40029393 0.23642005 0. 0.\n",
	" 0.30443385 0. 0.30443385 0.30443385 0.30443385 0.\n",
	" 0.30443385 0.40029393]\n",
	" [0.54270061 0. 0. 0.64105545 0. 0.\n",
	" 0. 0. 0. 0. 0. 0.54270061\n",
	" 0. 0. ]]\n"
	]
	}
	],
	"source": [
	"tf_idf_array = tf_idf_vector.toarray()\n",
	" \n",
	"print(tf_idf_array)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"id": "862dbaf1",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['analyze' 'best' 'courses' 'data' 'fields' 'important' 'is' 'most' 'of'\n",
	" 'one' 'science' 'scientists' 'the' 'this']\n"
	]
	}
	],
	"source": [
	"words_set = tr_idf_model.get_feature_names_out()\n",
	" \n",
	"print(words_set)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"id": "838bf0d7",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>analyze</th>\n",
	" <th>best</th>\n",
	" <th>courses</th>\n",
	" <th>data</th>\n",
	" <th>fields</th>\n",
	" <th>important</th>\n",
	" <th>is</th>\n",
	" <th>most</th>\n",
	" <th>of</th>\n",
	" <th>one</th>\n",
	" <th>science</th>\n",
	" <th>scientists</th>\n",
	" <th>the</th>\n",
	" <th>this</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.189526</td>\n",
	" <td>0.320895</td>\n",
	" <td>0.320895</td>\n",
	" <td>0.244049</td>\n",
	" <td>0.320895</td>\n",
	" <td>0.488098</td>\n",
	" <td>0.244049</td>\n",
	" <td>0.488098</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.244049</td>\n",
	" <td>0.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>0.000000</td>\n",
	" <td>0.400294</td>\n",
	" <td>0.400294</td>\n",
	" <td>0.236420</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.304434</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.304434</td>\n",
	" <td>0.304434</td>\n",
	" <td>0.304434</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.304434</td>\n",
	" <td>0.400294</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>0.542701</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.641055</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.542701</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" analyze best courses data fields important is \\\n",
	"0 0.000000 0.000000 0.000000 0.189526 0.320895 0.320895 0.244049 \n",
	"1 0.000000 0.400294 0.400294 0.236420 0.000000 0.000000 0.304434 \n",
	"2 0.542701 0.000000 0.000000 0.641055 0.000000 0.000000 0.000000 \n",
	"\n",
	" most of one science scientists the this \n",
	"0 0.320895 0.488098 0.244049 0.488098 0.000000 0.244049 0.000000 \n",
	"1 0.000000 0.304434 0.304434 0.304434 0.000000 0.304434 0.400294 \n",
	"2 0.000000 0.000000 0.000000 0.000000 0.542701 0.000000 0.000000 "
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set)\n",
	" \n",
	"df_tf_idf"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}