Skip to content

Instantly share code, notes, and snippets.

@koorukuroo
Last active September 22, 2017 01:39
Show Gist options
  • Save koorukuroo/19533f029846117888dd418712d10621 to your computer and use it in GitHub Desktop.
Save koorukuroo/19533f029846117888dd418712d10621 to your computer and use it in GitHub Desktop.
sklearn CountVectorizer fit_transform TEST
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"class data():\n",
" def __init__(self):\n",
" self.data = None"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ng_train = data()\n",
"ng_train.data = ['hi my name', 'hi name', 'my', 'hi my name doll hi']"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['hi my name', 'hi name', 'my', 'hi my name doll hi']"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ng_train.data"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import seaborn as sns\n",
"import numpy as np\n",
"import pandas as pd\n",
"from scipy import stats, integrate\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# Learn ngrams of size 1,2,3\n",
"# Use the 20000 most common items as the vocabulary\n",
"counter = CountVectorizer(ngram_range=(1,3), max_features=20000)\n",
"\n",
"# Transformers also have a fit_transform method\n",
"# This learns a transform then retursn the transform\n",
"# applied to the training data\n",
"train_counts = counter.fit_transform(ng_train.data)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=20000, min_df=1,\n",
" ngram_range=(1, 3), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counter"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"a = counter.fit(ng_train.data)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'doll': 0,\n",
" 'doll hi': 1,\n",
" 'hi': 2,\n",
" 'hi my': 3,\n",
" 'hi my name': 4,\n",
" 'hi name': 5,\n",
" 'my': 6,\n",
" 'my name': 7,\n",
" 'my name doll': 8,\n",
" 'name': 9,\n",
" 'name doll': 10,\n",
" 'name doll hi': 11}"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Vocabulary Number(Index)\n",
"a.vocabulary_"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<4x12 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 21 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.transform(ng_train.data)"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"b = counter.fit_transform(ng_train.data)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32),\n",
" array([ 4, 7, 3, 9, 6, 2, 5, 9, 2, 6, 11, 8, 1, 10, 0, 4, 7,\n",
" 3, 9, 6, 2], dtype=int32))"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Nonzero Indexes\n",
"train_counts.nonzero()"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([0, 0, 0, 0, 0, 0], dtype=int32),\n",
" array([4, 7, 3, 9, 6, 2], dtype=int32))"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# For Doc 0, nonzero items\n",
"train_counts[0].nonzero()"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([0, 0, 0], dtype=int32), array([5, 9, 2], dtype=int32))"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counts[1].nonzero()"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([0], dtype=int32), array([6], dtype=int32))"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counts[2].nonzero()"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),\n",
" array([11, 8, 1, 10, 0, 4, 7, 3, 9, 6, 2], dtype=int32))"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counts[3].nonzero()"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# train_counts[4].nonzero()"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[1, 1, 1, 1, 1, 1]])"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counts[0][train_counts[0].nonzero()]"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[1, 1, 1]])"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counts[1][train_counts[1].nonzero()]"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),\n",
" array([11, 8, 1, 10, 0, 4, 7, 3, 9, 6, 2], dtype=int32))"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counts[3].nonzero()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]], dtype=int64)"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counts[train_counts[3].nonzero()]"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]])"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counts[3][train_counts[3].nonzero()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment