Last active
June 16, 2020 06:38
-
-
Save secsilm/79fef46e3defa33e7644d59f2401aa34 to your computer and use it in GitHub Desktop.
sklearn 如何计算 TFIDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# sklearn 如何计算 TFIDF\n", | |
"\n", | |
"此 notebook 是博文 [sklearn 如何计算 TFIDF](https://alanlee.fun/2019/08/09/sklearn-tfidf/) 中的代码。" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import jieba\n", | |
"from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"documents = [\n", | |
" \"低头亲吻我的左手\",\n", | |
" \"换取被宽恕的承诺\",\n", | |
" \"老旧管风琴在角落\",\n", | |
" \"一直一直一直伴奏\",\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Building prefix dict from the default dictionary ...\n", | |
"Loading model from cache C:\\Users\\secsi\\AppData\\Local\\Temp\\jieba.cache\n", | |
"Loading model cost 0.909 seconds.\n", | |
"Prefix dict has been built succesfully.\n" | |
] | |
} | |
], | |
"source": [ | |
"documents = [\" \".join(jieba.cut(item)) for item in documents]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['低头 亲吻 我 的 左手', '换取 被 宽恕 的 承诺', '老旧 管风琴 在 角落', '一直 一直 一直 伴奏']" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"documents" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 手算\n", | |
"\n", | |
"以 `一直` 为例" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1.916290731874155" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# idf(一直, D)\n", | |
"np.log((1+4)/(1+1)) + 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"5.748872195622465" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# tfidf(一直, 文档 4, D)\n", | |
"3 * 1.916290731874155" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# 文档 4 未归一化 tfidf 向量\n", | |
"no_norm = np.array([5.748872195622465, 0, 1.916290731874155, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0.9486833 , 0. , 0.31622777, 0. , 0. ,\n", | |
" 0. , 0. , 0. , 0. , 0. ,\n", | |
" 0. , 0. , 0. , 0. , 0. ])" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 文档 4 归一化 tfidf 向量\n", | |
"normed = no_norm / np.sqrt(sum(no_norm ** 2))\n", | |
"normed" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 使用 sklearn 计算" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# 默认情况下 sklearn 会莫名其妙地去除掉一些停用词,即使 stop_words=None \n", | |
"# 详细讨论参见 https://github.com/scikit-learn/scikit-learn/issues/10756\n", | |
"vectorizer = TfidfVectorizer(token_pattern=r'(?u)\\b\\w+\\b', strip_accents=False)\n", | |
"X = vectorizer.fit_transform(documents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'低头': 3,\n", | |
" '亲吻': 1,\n", | |
" '我': 7,\n", | |
" '的': 10,\n", | |
" '左手': 6,\n", | |
" '换取': 9,\n", | |
" '被': 13,\n", | |
" '宽恕': 5,\n", | |
" '承诺': 8,\n", | |
" '老旧': 12,\n", | |
" '管风琴': 11,\n", | |
" '在': 4,\n", | |
" '角落': 14,\n", | |
" '一直': 0,\n", | |
" '伴奏': 2}" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vectorizer.vocabulary_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,\n", | |
" 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,\n", | |
" 1.51082562, 1.91629073, 1.91629073, 1.91629073, 1.91629073])" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vectorizer.idf_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[0. , 0.46516193, 0. , 0.46516193, 0. ,\n", | |
" 0. , 0.46516193, 0.46516193, 0. , 0. ,\n", | |
" 0.36673901, 0. , 0. , 0. , 0. ],\n", | |
" [0. , 0. , 0. , 0. , 0. ,\n", | |
" 0.46516193, 0. , 0. , 0.46516193, 0.46516193,\n", | |
" 0.36673901, 0. , 0. , 0.46516193, 0. ],\n", | |
" [0. , 0. , 0. , 0. , 0.5 ,\n", | |
" 0. , 0. , 0. , 0. , 0. ,\n", | |
" 0. , 0.5 , 0.5 , 0. , 0.5 ],\n", | |
" [0.9486833 , 0. , 0.31622777, 0. , 0. ,\n", | |
" 0. , 0. , 0. , 0. , 0. ,\n", | |
" 0. , 0. , 0. , 0. , 0. ]])" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X.toarray()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"set()" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vectorizer.stop_words_" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment