secsilm/sklearn-tfidf.ipynb

## sklearn-tfidf.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# sklearn 如何计算 TFIDF\n",
    "\n",
    "此 notebook 是博文 [sklearn 如何计算 TFIDF](https://alanlee.fun/2019/08/09/sklearn-tfidf/) 中的代码。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import jieba\n",
    "from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = [\n",
    "    \"低头亲吻我的左手\",\n",
    "    \"换取被宽恕的承诺\",\n",
    "    \"老旧管风琴在角落\",\n",
    "    \"一直一直一直伴奏\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\secsi\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.909 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "# Segment every sentence with jieba and join the tokens with single spaces so the\n",
    "# vectorizer can split them on word boundaries.\n",
    "# Whitespace-only tokens are dropped so that re-running this cell on the already\n",
    "# space-joined strings does not pile up additional spaces.\n",
    "documents = [\n",
    "    \" \".join(token for token in jieba.cut(item) if token.strip())\n",
    "    for item in documents\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['低头 亲吻 我 的 左手', '换取 被 宽恕 的 承诺', '老旧 管风琴 在 角落', '一直 一直 一直 伴奏']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 手算\n",
    "\n",
    "以 `一直` 为例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.916290731874155"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# idf(一直, D) = ln((1 + n) / (1 + df)) + 1\n",
    "# with n = 4 documents in the corpus and df = 1 document containing the term.\n",
    "1 + np.log((1 + 4) / (1 + 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.748872195622465"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tfidf(一直, document 4, D) = tf * idf\n",
    "# tf = 3 because the term occurs three times in document 4.\n",
    "1.916290731874155 * 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Un-normalized tf-idf vector of document 4 over the 15 vocabulary slots:\n",
    "# index 0 holds the value for 一直, index 2 the value for 伴奏, all others stay 0.\n",
    "no_norm = np.zeros(15)\n",
    "no_norm[0] = 5.748872195622465\n",
    "no_norm[2] = 1.916290731874155"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.9486833 , 0.        , 0.31622777, 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# L2-normalized tf-idf vector of document 4: divide the raw vector by its\n",
    "# Euclidean norm, computed directly in NumPy with np.linalg.norm.\n",
    "normed = no_norm / np.linalg.norm(no_norm)\n",
    "normed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用 sklearn 计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The default token_pattern r'(?u)\\b\\w\\w+\\b' only keeps tokens of two or more\n",
    "# characters, so single-character words such as 我 / 的 / 被 / 在 would be dropped\n",
    "# even with stop_words=None. The pattern below also accepts one-character tokens.\n",
    "# Discussion: https://github.com/scikit-learn/scikit-learn/issues/10756\n",
    "# strip_accents accepts 'ascii', 'unicode', a callable or None; None (the default)\n",
    "# leaves the characters untouched.\n",
    "vectorizer = TfidfVectorizer(token_pattern=r'(?u)\\b\\w+\\b', strip_accents=None)\n",
    "X = vectorizer.fit_transform(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'低头': 3,\n",
       " '亲吻': 1,\n",
       " '我': 7,\n",
       " '的': 10,\n",
       " '左手': 6,\n",
       " '换取': 9,\n",
       " '被': 13,\n",
       " '宽恕': 5,\n",
       " '承诺': 8,\n",
       " '老旧': 12,\n",
       " '管风琴': 11,\n",
       " '在': 4,\n",
       " '角落': 14,\n",
       " '一直': 0,\n",
       " '伴奏': 2}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,\n",
       "       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,\n",
       "       1.51082562, 1.91629073, 1.91629073, 1.91629073, 1.91629073])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer.idf_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.        , 0.46516193, 0.        , 0.46516193, 0.        ,\n",
       "        0.        , 0.46516193, 0.46516193, 0.        , 0.        ,\n",
       "        0.36673901, 0.        , 0.        , 0.        , 0.        ],\n",
       "       [0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "        0.46516193, 0.        , 0.        , 0.46516193, 0.46516193,\n",
       "        0.36673901, 0.        , 0.        , 0.46516193, 0.        ],\n",
       "       [0.        , 0.        , 0.        , 0.        , 0.5       ,\n",
       "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "        0.        , 0.5       , 0.5       , 0.        , 0.5       ],\n",
       "       [0.9486833 , 0.        , 0.31622777, 0.        , 0.        ,\n",
       "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "        0.        , 0.        , 0.        , 0.        , 0.        ]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "set()"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer.stop_words_"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# sklearn 如何计算 TFIDF\n",
	"\n",
	"此 notebook 是博文 [sklearn 如何计算 TFIDF](https://alanlee.fun/2019/08/09/sklearn-tfidf/) 中的代码。"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import jieba\n",
	"from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"documents = [\n",
	" \"低头亲吻我的左手\",\n",
	" \"换取被宽恕的承诺\",\n",
	" \"老旧管风琴在角落\",\n",
	" \"一直一直一直伴奏\",\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"Building prefix dict from the default dictionary ...\n",
	"Loading model from cache C:\\Users\\secsi\\AppData\\Local\\Temp\\jieba.cache\n",
	"Loading model cost 0.909 seconds.\n",
	"Prefix dict has been built succesfully.\n"
	]
	}
	],
	"source": [
	"# Segment every sentence with jieba and join the tokens with single spaces so the\n",
	"# vectorizer can split them on word boundaries.\n",
	"# Whitespace-only tokens are dropped so that re-running this cell on the already\n",
	"# space-joined strings does not pile up additional spaces.\n",
	"documents = [\n",
	"    \" \".join(token for token in jieba.cut(item) if token.strip())\n",
	"    for item in documents\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['低头 亲吻 我 的 左手', '换取 被 宽恕 的 承诺', '老旧 管风琴 在 角落', '一直 一直 一直 伴奏']"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"documents"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## 手算\n",
	"\n",
	"以 `一直` 为例"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1.916290731874155"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# idf(一直, D) = ln((1 + n) / (1 + df)) + 1\n",
	"# with n = 4 documents in the corpus and df = 1 document containing the term.\n",
	"1 + np.log((1 + 4) / (1 + 1))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"5.748872195622465"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# tfidf(一直, document 4, D) = tf * idf\n",
	"# tf = 3 because the term occurs three times in document 4.\n",
	"1.916290731874155 * 3"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Un-normalized tf-idf vector of document 4 over the 15 vocabulary slots:\n",
	"# index 0 holds the value for 一直, index 2 the value for 伴奏, all others stay 0.\n",
	"no_norm = np.zeros(15)\n",
	"no_norm[0] = 5.748872195622465\n",
	"no_norm[2] = 1.916290731874155"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([0.9486833 , 0. , 0.31622777, 0. , 0. ,\n",
	" 0. , 0. , 0. , 0. , 0. ,\n",
	" 0. , 0. , 0. , 0. , 0. ])"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# L2-normalized tf-idf vector of document 4: divide the raw vector by its\n",
	"# Euclidean norm, computed directly in NumPy with np.linalg.norm.\n",
	"normed = no_norm / np.linalg.norm(no_norm)\n",
	"normed"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## 使用 sklearn 计算"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"# The default token_pattern r'(?u)\\b\\w\\w+\\b' only keeps tokens of two or more\n",
	"# characters, so single-character words such as 我 / 的 / 被 / 在 would be dropped\n",
	"# even with stop_words=None. The pattern below also accepts one-character tokens.\n",
	"# Discussion: https://github.com/scikit-learn/scikit-learn/issues/10756\n",
	"# strip_accents accepts 'ascii', 'unicode', a callable or None; None (the default)\n",
	"# leaves the characters untouched.\n",
	"vectorizer = TfidfVectorizer(token_pattern=r'(?u)\\b\\w+\\b', strip_accents=None)\n",
	"X = vectorizer.fit_transform(documents)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'低头': 3,\n",
	" '亲吻': 1,\n",
	" '我': 7,\n",
	" '的': 10,\n",
	" '左手': 6,\n",
	" '换取': 9,\n",
	" '被': 13,\n",
	" '宽恕': 5,\n",
	" '承诺': 8,\n",
	" '老旧': 12,\n",
	" '管风琴': 11,\n",
	" '在': 4,\n",
	" '角落': 14,\n",
	" '一直': 0,\n",
	" '伴奏': 2}"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vectorizer.vocabulary_"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,\n",
	" 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,\n",
	" 1.51082562, 1.91629073, 1.91629073, 1.91629073, 1.91629073])"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vectorizer.idf_"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[0. , 0.46516193, 0. , 0.46516193, 0. ,\n",
	" 0. , 0.46516193, 0.46516193, 0. , 0. ,\n",
	" 0.36673901, 0. , 0. , 0. , 0. ],\n",
	" [0. , 0. , 0. , 0. , 0. ,\n",
	" 0.46516193, 0. , 0. , 0.46516193, 0.46516193,\n",
	" 0.36673901, 0. , 0. , 0.46516193, 0. ],\n",
	" [0. , 0. , 0. , 0. , 0.5 ,\n",
	" 0. , 0. , 0. , 0. , 0. ,\n",
	" 0. , 0.5 , 0.5 , 0. , 0.5 ],\n",
	" [0.9486833 , 0. , 0.31622777, 0. , 0. ,\n",
	" 0. , 0. , 0. , 0. , 0. ,\n",
	" 0. , 0. , 0. , 0. , 0. ]])"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"X.toarray()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"set()"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vectorizer.stop_words_"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}