Last active
November 4, 2015 13:12
-
-
Save arosh/3f0b2be594c45cd6ddc2 to your computer and use it in GitHub Desktop.
Document-Term Matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
LabelBinarizer : ラベル → OneHot表現 | |
LabelEncoder : ラベル → 0, 1, ..., N-1 | |
OneHotEncoder : 0, 1, ..., N-1 → OneHot表現 | |
https://github.com/recruit-tech/summpy/blob/master/summpy/lexrank.py#L34 | |
CountVectorizer : テキスト → document term matrix | |
TfidfTransformer : document term matrix → TF-IDF重み,l2正規化 | |
TfidfVectorizer : テキスト → TF-IDF重み,l2正規化 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"from __future__ import absolute_import, division, print_function, unicode_literals" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy\n", | |
"import scipy.sparse\n", | |
"from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder\n", | |
"from sklearn.feature_extraction import DictVectorizer\n", | |
"from collections import Counter\n", | |
"import itertools" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"words_list = [\n", | |
" [\"a\", \"d\", \"b\", \"b\", \"a\"],\n", | |
" [\"e\", \"c\", \"c\", \"a\"]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"array([[1, 0, 0, 0, 0],\n", | |
" [0, 0, 0, 1, 0],\n", | |
" [0, 1, 0, 0, 0],\n", | |
" [0, 1, 0, 0, 0],\n", | |
" [1, 0, 0, 0, 0]])\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"matrix([[2, 2, 0, 1, 0]])" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html\n", | |
"# sparse_outputのデフォルトはFalse\n", | |
"label_binarizer = LabelBinarizer(sparse_output=True)\n", | |
"# fitの引数yはarray-likeを要求するのでiteratorは不可\n", | |
"label_binarizer.fit(list(itertools.chain(*words_list)))\n", | |
"out = label_binarizer.transform(words_list[0])\n", | |
"# sparse_output=Trueのときtransformの戻り値はscipy.sparse.csr_matrixなのでtoarray()している\n", | |
"print(repr(out.toarray()))\n", | |
"out.sum(axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"array([0, 3, 1, 1, 0])\n", | |
"array([[1, 0, 0, 0, 0],\n", | |
" [0, 0, 0, 1, 0],\n", | |
" [0, 1, 0, 0, 0],\n", | |
" [0, 1, 0, 0, 0],\n", | |
" [1, 0, 0, 0, 0]])\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"matrix([[2, 2, 0, 1, 0]])" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html\n", | |
"label_encoder = LabelEncoder()\n", | |
"label_encoder.fit(list(itertools.chain(*words_list)))\n", | |
"out = label_encoder.transform(words_list[0])\n", | |
"print(repr(out))\n", | |
"# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html\n", | |
"one_hot_encoder = OneHotEncoder(n_values=len(label_encoder.classes_), dtype=numpy.int64)\n", | |
"# fitのXはshape=(n_samples, n_features)\n", | |
"out2 = one_hot_encoder.fit_transform(out.reshape(-1, 1))\n", | |
"print(repr(out2.toarray()))\n", | |
"out2.sum(axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"array([[2, 2, 0, 1, 0],\n", | |
" [1, 0, 2, 0, 1]])\n", | |
"<class 'scipy.sparse.csr.csr_matrix'>\n" | |
] | |
} | |
], | |
"source": [ | |
"# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html\n", | |
"dict_vectorizer = DictVectorizer(dtype=numpy.int64)\n", | |
"tf_list = []\n", | |
"for words in words_list:\n", | |
" tf_list.append(Counter(words))\n", | |
"out = dict_vectorizer.fit_transform(tf_list)\n", | |
"print(repr(out.toarray()))\n", | |
"print(type(out))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment