Skip to content

Instantly share code, notes, and snippets.

@iamirmasoud
Created March 20, 2023 09:54
Show Gist options
  • Save iamirmasoud/fa11ceae3503ce175b374bb3d5dc0ea9 to your computer and use it in GitHub Desktop.
Save iamirmasoud/fa11ceae3503ce175b374bb3d5dc0ea9 to your computer and use it in GitHub Desktop.
Understanding TF-IDF with Python example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "030a8739",
"metadata": {},
"source": [
"# TF-IDF from scratch\n",
"\n",
"This is the notebook file for the following post:\n",
"\n",
"[Understanding TF-IDF with Python example](http://www.sefidian.com/2022/02/28/understanding-tf-idf-with-python-example)"
]
},
{
"cell_type": "markdown",
"id": "80326f4d",
"metadata": {},
"source": [
"# Manual calculation of TF-IDF"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "85cab12d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8d810df5",
"metadata": {},
"outputs": [],
"source": [
"corpus = ['data science is one of the most important fields of science',\n",
" 'this is one of the best data science courses',\n",
" 'data scientists analyze data' ]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "86ec1722",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of words in the corpus: 14\n",
"The words in the corpus: \n",
" {'scientists', 'is', 'of', 'data', 'analyze', 'courses', 'one', 'this', 'the', 'fields', 'most', 'science', 'important', 'best'}\n"
]
}
],
"source": [
"words_set = set()\n",
" \n",
"for doc in corpus:\n",
" words = doc.split(' ')\n",
" words_set = words_set.union(set(words))\n",
" \n",
"print('Number of words in the corpus:',len(words_set))\n",
"print('The words in the corpus: \\n', words_set)"
]
},
{
"cell_type": "markdown",
"id": "eaa017e7",
"metadata": {},
"source": [
"## Computing Term Frequency"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5ab67f43",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>scientists</th>\n",
" <th>is</th>\n",
" <th>of</th>\n",
" <th>data</th>\n",
" <th>analyze</th>\n",
" <th>courses</th>\n",
" <th>one</th>\n",
" <th>this</th>\n",
" <th>the</th>\n",
" <th>fields</th>\n",
" <th>most</th>\n",
" <th>science</th>\n",
" <th>important</th>\n",
" <th>best</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.00</td>\n",
" <td>0.090909</td>\n",
" <td>0.181818</td>\n",
" <td>0.090909</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.090909</td>\n",
" <td>0.000000</td>\n",
" <td>0.090909</td>\n",
" <td>0.090909</td>\n",
" <td>0.090909</td>\n",
" <td>0.181818</td>\n",
" <td>0.090909</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.00</td>\n",
" <td>0.111111</td>\n",
" <td>0.111111</td>\n",
" <td>0.111111</td>\n",
" <td>0.00</td>\n",
" <td>0.111111</td>\n",
" <td>0.111111</td>\n",
" <td>0.111111</td>\n",
" <td>0.111111</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.111111</td>\n",
" <td>0.000000</td>\n",
" <td>0.111111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.25</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.25</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" scientists is of data analyze courses one \\\n",
"0 0.00 0.090909 0.181818 0.090909 0.00 0.000000 0.090909 \n",
"1 0.00 0.111111 0.111111 0.111111 0.00 0.111111 0.111111 \n",
"2 0.25 0.000000 0.000000 0.500000 0.25 0.000000 0.000000 \n",
"\n",
" this the fields most science important best \n",
"0 0.000000 0.090909 0.090909 0.090909 0.181818 0.090909 0.000000 \n",
"1 0.111111 0.111111 0.000000 0.000000 0.111111 0.000000 0.111111 \n",
"2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n_docs = len(corpus) # Number of documents in the corpus\n",
"n_words_set = len(words_set) # Number of unique words in the\n",
" \n",
"df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)\n",
" \n",
"# Compute Term Frequency (TF)\n",
"for i in range(n_docs):\n",
" words = corpus[i].split(' ') # Words in the document\n",
" for w in words:\n",
" df_tf[w][i] = df_tf[w][i] + (1 / len(words))\n",
" \n",
"df_tf"
]
},
{
"cell_type": "markdown",
"id": "91e58674",
"metadata": {},
"source": [
"## Computing Inverse Document Frequency"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9e887ed1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"IDF of: \n",
" scientists: 0.47712125471966244\n",
" is: 0.17609125905568124\n",
" of: 0.17609125905568124\n",
" data: 0.0\n",
" analyze: 0.47712125471966244\n",
" courses: 0.47712125471966244\n",
" one: 0.17609125905568124\n",
" this: 0.47712125471966244\n",
" the: 0.17609125905568124\n",
" fields: 0.47712125471966244\n",
" most: 0.47712125471966244\n",
" science: 0.17609125905568124\n",
" important: 0.47712125471966244\n",
" best: 0.47712125471966244\n"
]
}
],
"source": [
"print(\"IDF of: \")\n",
" \n",
"idf = {}\n",
" \n",
"for w in words_set:\n",
" k = 0 # number of documents in the corpus that contain this word\n",
" \n",
" for i in range(n_docs):\n",
" if w in corpus[i].split():\n",
" k += 1\n",
" \n",
" idf[w] = np.log10(n_docs / k)\n",
" \n",
" print(f'{w:>15}: {idf[w]:>10}' )"
]
},
{
"cell_type": "markdown",
"id": "ada1fef3",
"metadata": {},
"source": [
"## Putting it Together: Computing TF-IDF"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a9bd2aad",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>scientists</th>\n",
" <th>is</th>\n",
" <th>of</th>\n",
" <th>data</th>\n",
" <th>analyze</th>\n",
" <th>courses</th>\n",
" <th>one</th>\n",
" <th>this</th>\n",
" <th>the</th>\n",
" <th>fields</th>\n",
" <th>most</th>\n",
" <th>science</th>\n",
" <th>important</th>\n",
" <th>best</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.00000</td>\n",
" <td>0.016008</td>\n",
" <td>0.032017</td>\n",
" <td>0.0</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>0.016008</td>\n",
" <td>0.000000</td>\n",
" <td>0.016008</td>\n",
" <td>0.043375</td>\n",
" <td>0.043375</td>\n",
" <td>0.032017</td>\n",
" <td>0.043375</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.00000</td>\n",
" <td>0.019566</td>\n",
" <td>0.019566</td>\n",
" <td>0.0</td>\n",
" <td>0.00000</td>\n",
" <td>0.053013</td>\n",
" <td>0.019566</td>\n",
" <td>0.053013</td>\n",
" <td>0.019566</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.019566</td>\n",
" <td>0.000000</td>\n",
" <td>0.053013</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.11928</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.11928</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" scientists is of data analyze courses one \\\n",
"0 0.00000 0.016008 0.032017 0.0 0.00000 0.000000 0.016008 \n",
"1 0.00000 0.019566 0.019566 0.0 0.00000 0.053013 0.019566 \n",
"2 0.11928 0.000000 0.000000 0.0 0.11928 0.000000 0.000000 \n",
"\n",
" this the fields most science important best \n",
"0 0.000000 0.016008 0.043375 0.043375 0.032017 0.043375 0.000000 \n",
"1 0.053013 0.019566 0.000000 0.000000 0.019566 0.000000 0.053013 \n",
"2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_tf_idf = df_tf.copy()\n",
" \n",
"for w in words_set:\n",
" for i in range(n_docs):\n",
" df_tf_idf[w][i] = df_tf[w][i] * idf[w]\n",
" \n",
"df_tf_idf"
]
},
{
"cell_type": "markdown",
"id": "062e791b",
"metadata": {},
"source": [
"# TF-IDF Using scikit-learn"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "382c5321",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "35061ba7",
"metadata": {},
"outputs": [],
"source": [
"tr_idf_model = TfidfVectorizer()\n",
"tf_idf_vector = tr_idf_model.fit_transform(corpus)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7780772d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'scipy.sparse.csr.csr_matrix'> (3, 14)\n"
]
}
],
"source": [
"print(type(tf_idf_vector), tf_idf_vector.shape)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "bc69a9d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0. 0. 0. 0.18952581 0.32089509 0.32089509\n",
" 0.24404899 0.32089509 0.48809797 0.24404899 0.48809797 0.\n",
" 0.24404899 0. ]\n",
" [0. 0.40029393 0.40029393 0.23642005 0. 0.\n",
" 0.30443385 0. 0.30443385 0.30443385 0.30443385 0.\n",
" 0.30443385 0.40029393]\n",
" [0.54270061 0. 0. 0.64105545 0. 0.\n",
" 0. 0. 0. 0. 0. 0.54270061\n",
" 0. 0. ]]\n"
]
}
],
"source": [
"tf_idf_array = tf_idf_vector.toarray()\n",
" \n",
"print(tf_idf_array)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "862dbaf1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['analyze' 'best' 'courses' 'data' 'fields' 'important' 'is' 'most' 'of'\n",
" 'one' 'science' 'scientists' 'the' 'this']\n"
]
}
],
"source": [
"words_set = tr_idf_model.get_feature_names_out()\n",
" \n",
"print(words_set)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "838bf0d7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>analyze</th>\n",
" <th>best</th>\n",
" <th>courses</th>\n",
" <th>data</th>\n",
" <th>fields</th>\n",
" <th>important</th>\n",
" <th>is</th>\n",
" <th>most</th>\n",
" <th>of</th>\n",
" <th>one</th>\n",
" <th>science</th>\n",
" <th>scientists</th>\n",
" <th>the</th>\n",
" <th>this</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.189526</td>\n",
" <td>0.320895</td>\n",
" <td>0.320895</td>\n",
" <td>0.244049</td>\n",
" <td>0.320895</td>\n",
" <td>0.488098</td>\n",
" <td>0.244049</td>\n",
" <td>0.488098</td>\n",
" <td>0.000000</td>\n",
" <td>0.244049</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.000000</td>\n",
" <td>0.400294</td>\n",
" <td>0.400294</td>\n",
" <td>0.236420</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.304434</td>\n",
" <td>0.000000</td>\n",
" <td>0.304434</td>\n",
" <td>0.304434</td>\n",
" <td>0.304434</td>\n",
" <td>0.000000</td>\n",
" <td>0.304434</td>\n",
" <td>0.400294</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.542701</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.641055</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.542701</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" analyze best courses data fields important is \\\n",
"0 0.000000 0.000000 0.000000 0.189526 0.320895 0.320895 0.244049 \n",
"1 0.000000 0.400294 0.400294 0.236420 0.000000 0.000000 0.304434 \n",
"2 0.542701 0.000000 0.000000 0.641055 0.000000 0.000000 0.000000 \n",
"\n",
" most of one science scientists the this \n",
"0 0.320895 0.488098 0.244049 0.488098 0.000000 0.244049 0.000000 \n",
"1 0.000000 0.304434 0.304434 0.304434 0.000000 0.304434 0.400294 \n",
"2 0.000000 0.000000 0.000000 0.000000 0.542701 0.000000 0.000000 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set)\n",
" \n",
"df_tf_idf"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment