Skip to content

Instantly share code, notes, and snippets.

@arosh
Last active November 4, 2015 13:12
Show Gist options
  • Save arosh/3f0b2be594c45cd6ddc2 to your computer and use it in GitHub Desktop.
Save arosh/3f0b2be594c45cd6ddc2 to your computer and use it in GitHub Desktop.
Document-Term Matrix
LabelBinarizer : ラベル → OneHot表現
LabelEncoder : ラベル → 0, 1, ..., N-1
OneHotEncoder : 0, 1, ..., N-1 → OneHot表現
https://github.com/recruit-tech/summpy/blob/master/summpy/lexrank.py#L34
CountVectorizer : テキスト → document term matrix
TfidfTransformer : document term matrix → TF-IDF重み付け, L2正規化
TfidfVectorizer : テキスト → TF-IDF重み付け, L2正規化 (CountVectorizer + TfidfTransformer 相当)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from __future__ import absolute_import, division, print_function, unicode_literals"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy\n",
"import scipy.sparse\n",
"from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"from collections import Counter\n",
"import itertools"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"words_list = [\n",
" [\"a\", \"d\", \"b\", \"b\", \"a\"],\n",
" [\"e\", \"c\", \"c\", \"a\"]]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"array([[1, 0, 0, 0, 0],\n",
" [0, 0, 0, 1, 0],\n",
" [0, 1, 0, 0, 0],\n",
" [0, 1, 0, 0, 0],\n",
" [1, 0, 0, 0, 0]])\n"
]
},
{
"data": {
"text/plain": [
"matrix([[2, 2, 0, 1, 0]])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html\n",
"# sparse_outputのデフォルトはFalse\n",
"label_binarizer = LabelBinarizer(sparse_output=True)\n",
"# fitのXはarray-likeを要求するのでiteratorは不可\n",
"label_binarizer.fit(list(itertools.chain(*words_list)))\n",
"out = label_binarizer.transform(words_list[0])\n",
"# sparse_output=Trueのためtransformの戻り値はscipy.sparse.csr_matrixになるので、表示用にtoarray()している\n",
"print(repr(out.toarray()))\n",
"out.sum(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"array([0, 3, 1, 1, 0])\n",
"array([[1, 0, 0, 0, 0],\n",
" [0, 0, 0, 1, 0],\n",
" [0, 1, 0, 0, 0],\n",
" [0, 1, 0, 0, 0],\n",
" [1, 0, 0, 0, 0]])\n"
]
},
{
"data": {
"text/plain": [
"matrix([[2, 2, 0, 1, 0]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html\n",
"label_encoder = LabelEncoder()\n",
"label_encoder.fit(list(itertools.chain(*words_list)))\n",
"out = label_encoder.transform(words_list[0])\n",
"print(repr(out))\n",
"# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html\n",
"one_hot_encoder = OneHotEncoder(n_values=len(label_encoder.classes_), dtype=numpy.int64)\n",
"# fit_transformのXはshape=(n_samples, n_features)の2次元配列が必要なので、reshape(-1, 1)で列ベクトルにしている\n",
"out2 = one_hot_encoder.fit_transform(out.reshape(-1, 1))\n",
"print(repr(out2.toarray()))\n",
"out2.sum(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"array([[2, 2, 0, 1, 0],\n",
" [1, 0, 2, 0, 1]])\n",
"<class 'scipy.sparse.csr.csr_matrix'>\n"
]
}
],
"source": [
"# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html\n",
"dict_vectorizer = DictVectorizer(dtype=numpy.int64)\n",
"tf_list = []\n",
"for words in words_list:\n",
" tf_list.append(Counter(words))\n",
"out = dict_vectorizer.fit_transform(tf_list)\n",
"print(repr(out.toarray()))\n",
"print(type(out))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment