Last active
September 22, 2017 01:39
-
-
Save koorukuroo/19533f029846117888dd418712d10621 to your computer and use it in GitHub Desktop.
sklearn CountVectorizer fit_transform TEST
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"source": [ | |
"%pylab inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"class data():\n", | |
" def __init__(self):\n", | |
" self.data = None" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"ng_train = data()\n", | |
"ng_train.data = ['hi my name', 'hi name', 'my', 'hi my name doll hi']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['hi my name', 'hi name', 'my', 'hi my name doll hi']" | |
] | |
}, | |
"execution_count": 98, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ng_train.data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 99, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import seaborn as sns\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"from scipy import stats, integrate\n", | |
"import matplotlib.pyplot as plt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"\n", | |
"# Learn ngrams of size 1,2,3\n", | |
"# Use the 20000 most common items as the vocabulary\n", | |
"counter = CountVectorizer(ngram_range=(1,3), max_features=20000)\n", | |
"\n", | |
"# Transformers also have a fit_transform method\n", | |
"# This learns a transform then retursn the transform\n", | |
"# applied to the training data\n", | |
"train_counts = counter.fit_transform(ng_train.data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n", | |
" lowercase=True, max_df=1.0, max_features=20000, min_df=1,\n", | |
" ngram_range=(1, 3), preprocessor=None, stop_words=None,\n", | |
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", | |
" tokenizer=None, vocabulary=None)" | |
] | |
}, | |
"execution_count": 101, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"counter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"a = counter.fit(ng_train.data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'doll': 0,\n", | |
" 'doll hi': 1,\n", | |
" 'hi': 2,\n", | |
" 'hi my': 3,\n", | |
" 'hi my name': 4,\n", | |
" 'hi name': 5,\n", | |
" 'my': 6,\n", | |
" 'my name': 7,\n", | |
" 'my name doll': 8,\n", | |
" 'name': 9,\n", | |
" 'name doll': 10,\n", | |
" 'name doll hi': 11}" | |
] | |
}, | |
"execution_count": 103, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Vocabulary Number(Index)\n", | |
"a.vocabulary_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<4x12 sparse matrix of type '<class 'numpy.int64'>'\n", | |
"\twith 21 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 104, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"a.transform(ng_train.data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 105, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"b = counter.fit_transform(ng_train.data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 106, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(array([0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32),\n", | |
" array([ 4, 7, 3, 9, 6, 2, 5, 9, 2, 6, 11, 8, 1, 10, 0, 4, 7,\n", | |
" 3, 9, 6, 2], dtype=int32))" | |
] | |
}, | |
"execution_count": 106, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Nonzero Indexes\n", | |
"train_counts.nonzero()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(array([0, 0, 0, 0, 0, 0], dtype=int32),\n", | |
" array([4, 7, 3, 9, 6, 2], dtype=int32))" | |
] | |
}, | |
"execution_count": 107, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# For Doc 0, nonzero items\n", | |
"train_counts[0].nonzero()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 108, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(array([0, 0, 0], dtype=int32), array([5, 9, 2], dtype=int32))" | |
] | |
}, | |
"execution_count": 108, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_counts[1].nonzero()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 109, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(array([0], dtype=int32), array([6], dtype=int32))" | |
] | |
}, | |
"execution_count": 109, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_counts[2].nonzero()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 110, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),\n", | |
" array([11, 8, 1, 10, 0, 4, 7, 3, 9, 6, 2], dtype=int32))" | |
] | |
}, | |
"execution_count": 110, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_counts[3].nonzero()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# train_counts[4].nonzero()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"matrix([[1, 1, 1, 1, 1, 1]])" | |
] | |
}, | |
"execution_count": 112, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_counts[0][train_counts[0].nonzero()]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"matrix([[1, 1, 1]])" | |
] | |
}, | |
"execution_count": 113, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_counts[1][train_counts[1].nonzero()]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),\n", | |
" array([11, 8, 1, 10, 0, 4, 7, 3, 9, 6, 2], dtype=int32))" | |
] | |
}, | |
"execution_count": 114, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_counts[3].nonzero()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 115, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"matrix([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]], dtype=int64)" | |
] | |
}, | |
"execution_count": 115, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_counts[train_counts[3].nonzero()]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]])" | |
] | |
}, | |
"execution_count": 116, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_counts[3][train_counts[3].nonzero()]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment