Skip to content

Instantly share code, notes, and snippets.

@rth
Last active September 9, 2016 15:36
Show Gist options
  • Save rth/1fba8e88d2d1f3cd3ef49b0d88a22c57 to your computer and use it in GitHub Desktop.
Save rth/1fba8e88d2d1f3cd3ef49b0d88a22c57 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Memory / Performance Benchmark for Feature Extraction in Scikit Learn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2016-08-27 14:01:38-- https://www.cs.cmu.edu/%7Eenron/enron_mail_20150507.tgz\n",
"Resolving www.cs.cmu.edu... 128.2.217.13\n",
"Connecting to www.cs.cmu.edu|128.2.217.13|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 443254787 (423M) [application/x-tar]\n",
"Saving to: ‘enron_mail_20150507.tgz’\n",
"\n",
"enron_mail_20150507 100%[===================>] 422.72M 173KB/s in 22m 55s \n",
"\n",
"2016-08-27 14:24:34 (315 KB/s) - ‘enron_mail_20150507.tgz’ saved [443254787/443254787]\n",
"\n"
]
}
],
"source": [
"# This environement for testing was created with,\n",
"#\n",
"# conda create -n sklearn-test numpy scipy nose ipython jupyter pandas python=3.5\n",
"# source activate sklearn-test\n",
"# pip install memory_profiler psutil\n",
"# \n",
"# and scikit-learn package installed with\n",
"# python setup.py develop\n",
"#\n",
"# The benchmarks are performed with a, \n",
"# Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz, NVME SSD \n",
"# and take a few hours to run.\n",
"\n",
"\n",
"# getting the enron email dataset\n",
"!wget https://www.cs.cmu.edu/%7Eenron/enron_mail_20150507.tgz\n",
"!tar xzf enron_mail_20150507.tgz"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Importing: /home/rth/sandbox/scikit-learn/sklearn/feature_extraction/text.py\n"
]
}
],
"source": [
"import memory_profiler\n",
"import pandas as pd\n",
"import numpy as np\n",
"from imp import reload\n",
"import os\n",
"from time import time\n",
"from IPython.display import display\n",
"pd.options.display.float_format = '{:,.1f}'.format\n",
"import sklearn.feature_extraction.text\n",
"\n",
"print('Importing: {}'.format(sklearn.feature_extraction.text.__file__))\n",
"\n",
"vectorizer_versions = [('reference vers.', 'acf8368ca3b426e504c00d1df3c2d51b7fa89c65'), \n",
" #('PR #4968', '5c544e91eee6e18d38d29b523a322706426e76d4'),\n",
" #('PR #5122', \"0b2a0a07520a29ef64e2a9171151bcc580bbd6aa\"),\n",
" ('PR #5122 + Is. #5306', '1418392047968bb7d5a1ed4a5e523ec071b42adf'),\n",
" ('PR last version', \"039f7ebf31ab7334832e4751c43d70b6f53e40b6\")]\n",
"\n",
"SKLEARN_DIR = \"~/sandbox/scikit-learn\"\n",
"MAX_DOCS = 10000\n",
"\n",
"\n",
"def iterate_enron_docs(path, max_docs=None, truncate=None):\n",
" num = 0\n",
" for root, dirs, files in os.walk(path):\n",
" for name in files:\n",
" num += 1\n",
" with open(os.path.join(root, name), 'r') as fh:\n",
" try:\n",
" txt = fh.read()\n",
" if truncate is None:\n",
" yield txt\n",
" else:\n",
" yield txt[:truncate]\n",
" except UnicodeDecodeError:\n",
" pass\n",
" if max_docs is not None and num > max_docs:\n",
" break\n",
"\n",
"def git_checkout(commit_id):\n",
" !cd $SKLEARN_DIR; git checkout $commit_id; cd -;\n",
"\n",
"def count_vectorizer_memory_usage(method, max_docs, truncate):\n",
" \n",
" reload(sklearn.feature_extraction.text)\n",
" \n",
" \n",
" vect_class = getattr(sklearn.feature_extraction.text, method)\n",
" \n",
" vect = vect_class()\n",
" vect.fit_transform(iterate_enron_docs(\"./maildir/\", max_docs=max_docs, truncate=truncate))\n",
"\n",
"def measure_feature_extraction(max_docs, n_repeat, truncate=None):\n",
" out = []\n",
" for version, commit_id in vectorizer_versions:\n",
" git_checkout(commit_id)\n",
" for method in ['CountVectorizer', 'TfidfVectorizer', 'HashingVectorizer']:\n",
" for k in range(n_repeat):\n",
" t0 = time()\n",
" musage = memory_profiler.memory_usage((count_vectorizer_memory_usage, (method, max_docs, truncate)))\n",
" t1 = time()\n",
" out.append({'cpu_time [s]': t1 - t0, 'version': version, 'method': method,\n",
" 'memory_peak [MB]': max(musage) - min(musage)})\n",
"\n",
" # the seem to be a significant fluctuation from run to run for small datasets\n",
" # take an average of multiple runs\n",
" res = pd.DataFrame(out).groupby(by=['version', 'method'], sort=False).mean()\n",
" return res"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark on non truncated documents"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document length statististics: min=383 char, mean=2719.120701086967 char, max=2011941 char\n"
]
}
],
"source": [
"docs_len = np.asarray([len(doc) for doc in iterate_enron_docs(\"./maildir/\")])\n",
"print('Document length statististics: min={} char, mean={} char, max={} char'.format(\n",
" docs_len.min(), docs_len.mean(), docs_len.max()))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
"HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
"HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"\n",
"======= Dataset size: 10000, n_repeat: 3 ======\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" <th>memory_peak [MB]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>version</th>\n",
" <th>method</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
" <th>CountVectorizer</th>\n",
" <td>5.5</td>\n",
" <td>98.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>5.3</td>\n",
" <td>91.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>6.3</td>\n",
" <td>78.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
" <th>CountVectorizer</th>\n",
" <td>6.2</td>\n",
" <td>62.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>6.3</td>\n",
" <td>56.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>6.1</td>\n",
" <td>69.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
" <th>CountVectorizer</th>\n",
" <td>5.4</td>\n",
" <td>63.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>5.7</td>\n",
" <td>63.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>6.3</td>\n",
" <td>70.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s] memory_peak [MB]\n",
"version method \n",
"reference vers. CountVectorizer 5.5 98.4\n",
" TfidfVectorizer 5.3 91.4\n",
" HashingVectorizer 6.3 78.8\n",
"PR #5122 + Is. #5306 CountVectorizer 6.2 62.5\n",
" TfidfVectorizer 6.3 56.4\n",
" HashingVectorizer 6.1 69.4\n",
"PR last version CountVectorizer 5.4 63.1\n",
" TfidfVectorizer 5.7 63.2\n",
" HashingVectorizer 6.3 70.4"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
"HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
"HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"\n",
"======= Dataset size: 30000, n_repeat: 1 ======\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" <th>memory_peak [MB]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>version</th>\n",
" <th>method</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
" <th>CountVectorizer</th>\n",
" <td>17.4</td>\n",
" <td>321.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>17.6</td>\n",
" <td>322.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>20.0</td>\n",
" <td>238.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
" <th>CountVectorizer</th>\n",
" <td>20.4</td>\n",
" <td>171.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>21.0</td>\n",
" <td>229.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>20.9</td>\n",
" <td>191.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
" <th>CountVectorizer</th>\n",
" <td>18.0</td>\n",
" <td>191.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>18.1</td>\n",
" <td>225.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>20.2</td>\n",
" <td>199.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s] memory_peak [MB]\n",
"version method \n",
"reference vers. CountVectorizer 17.4 321.8\n",
" TfidfVectorizer 17.6 322.1\n",
" HashingVectorizer 20.0 238.4\n",
"PR #5122 + Is. #5306 CountVectorizer 20.4 171.7\n",
" TfidfVectorizer 21.0 229.6\n",
" HashingVectorizer 20.9 191.3\n",
"PR last version CountVectorizer 18.0 191.5\n",
" TfidfVectorizer 18.1 225.5\n",
" HashingVectorizer 20.2 199.4"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
"HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
"HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"\n",
"======= Dataset size: 50000, n_repeat: 1 ======\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" <th>memory_peak [MB]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>version</th>\n",
" <th>method</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
" <th>CountVectorizer</th>\n",
" <td>28.7</td>\n",
" <td>476.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>27.8</td>\n",
" <td>476.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>32.8</td>\n",
" <td>462.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
" <th>CountVectorizer</th>\n",
" <td>30.8</td>\n",
" <td>276.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>31.4</td>\n",
" <td>356.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>32.5</td>\n",
" <td>388.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
" <th>CountVectorizer</th>\n",
" <td>28.9</td>\n",
" <td>306.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>31.5</td>\n",
" <td>366.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>33.3</td>\n",
" <td>363.2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s] memory_peak [MB]\n",
"version method \n",
"reference vers. CountVectorizer 28.7 476.7\n",
" TfidfVectorizer 27.8 476.9\n",
" HashingVectorizer 32.8 462.2\n",
"PR #5122 + Is. #5306 CountVectorizer 30.8 276.1\n",
" TfidfVectorizer 31.4 356.6\n",
" HashingVectorizer 32.5 388.9\n",
"PR last version CountVectorizer 28.9 306.4\n",
" TfidfVectorizer 31.5 366.7\n",
" HashingVectorizer 33.3 363.2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
"HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
"HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"\n",
"======= Dataset size: 100000, n_repeat: 1 ======\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" <th>memory_peak [MB]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>version</th>\n",
" <th>method</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
" <th>CountVectorizer</th>\n",
" <td>66.5</td>\n",
" <td>1,056.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>67.6</td>\n",
" <td>1,051.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>75.7</td>\n",
" <td>888.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
" <th>CountVectorizer</th>\n",
" <td>69.4</td>\n",
" <td>563.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>70.2</td>\n",
" <td>729.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>71.8</td>\n",
" <td>877.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
" <th>CountVectorizer</th>\n",
" <td>65.0</td>\n",
" <td>570.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>66.2</td>\n",
" <td>740.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>74.9</td>\n",
" <td>807.3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s] memory_peak [MB]\n",
"version method \n",
"reference vers. CountVectorizer 66.5 1,056.8\n",
" TfidfVectorizer 67.6 1,051.0\n",
" HashingVectorizer 75.7 888.6\n",
"PR #5122 + Is. #5306 CountVectorizer 69.4 563.8\n",
" TfidfVectorizer 70.2 729.2\n",
" HashingVectorizer 71.8 877.4\n",
"PR last version CountVectorizer 65.0 570.6\n",
" TfidfVectorizer 66.2 740.9\n",
" HashingVectorizer 74.9 807.3"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for max_docs in [10000, 30000, 50000, 100000]:\n",
" if max_docs <= 10000:\n",
" n_repeat = 3\n",
" else:\n",
" n_repeat = 1\n",
" res = measure_feature_extraction(max_docs, n_repeat, truncate=None)\n",
" print('\\n======= Dataset size: {}, n_repeat: {} ======'.format(max_docs, n_repeat))\n",
" display(res)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark on documents truncated to 200 char length"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: checking out 'acf8368ca3b426e504c00d1df3c2d51b7fa89c65'.\n",
"\n",
"You are in 'detached HEAD' state. You can look around, make experimental\n",
"changes and commit them, and you can discard any commits you make in this\n",
"state without impacting any branches by performing another checkout.\n",
"\n",
"If you want to create a new branch to retain commits you create, you may\n",
"do so (now or later) by using -b with the checkout command again. Example:\n",
"\n",
" git checkout -b <new-branch-name>\n",
"\n",
"HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
"HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"\n",
"======= Dataset size: 10000, n_repeat: 3 ======\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" <th>memory_peak [MB]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>version</th>\n",
" <th>method</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
" <th>CountVectorizer</th>\n",
" <td>1.1</td>\n",
" <td>2.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>1.0</td>\n",
" <td>7.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
" <th>CountVectorizer</th>\n",
" <td>1.1</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>1.2</td>\n",
" <td>6.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>1.1</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
" <th>CountVectorizer</th>\n",
" <td>1.2</td>\n",
" <td>0.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>1.1</td>\n",
" <td>6.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>1.1</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s] memory_peak [MB]\n",
"version method \n",
"reference vers. CountVectorizer 1.1 2.8\n",
" TfidfVectorizer 1.0 7.4\n",
" HashingVectorizer 1.0 2.0\n",
"PR #5122 + Is. #5306 CountVectorizer 1.1 1.0\n",
" TfidfVectorizer 1.2 6.3\n",
" HashingVectorizer 1.1 2.0\n",
"PR last version CountVectorizer 1.2 0.5\n",
" TfidfVectorizer 1.1 6.9\n",
" HashingVectorizer 1.1 2.0"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
"HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
"HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"\n",
"======= Dataset size: 30000, n_repeat: 1 ======\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" <th>memory_peak [MB]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>version</th>\n",
" <th>method</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
" <th>CountVectorizer</th>\n",
" <td>3.9</td>\n",
" <td>37.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>2.8</td>\n",
" <td>38.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>3.0</td>\n",
" <td>17.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
" <th>CountVectorizer</th>\n",
" <td>3.4</td>\n",
" <td>33.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>3.4</td>\n",
" <td>41.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>3.0</td>\n",
" <td>21.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
" <th>CountVectorizer</th>\n",
" <td>3.2</td>\n",
" <td>32.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>3.1</td>\n",
" <td>36.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>3.0</td>\n",
" <td>18.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s] memory_peak [MB]\n",
"version method \n",
"reference vers. CountVectorizer 3.9 37.0\n",
" TfidfVectorizer 2.8 38.6\n",
" HashingVectorizer 3.0 17.0\n",
"PR #5122 + Is. #5306 CountVectorizer 3.4 33.8\n",
" TfidfVectorizer 3.4 41.9\n",
" HashingVectorizer 3.0 21.6\n",
"PR last version CountVectorizer 3.2 32.2\n",
" TfidfVectorizer 3.1 36.3\n",
" HashingVectorizer 3.0 18.4"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
"HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
"HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"\n",
"======= Dataset size: 50000, n_repeat: 1 ======\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" <th>memory_peak [MB]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>version</th>\n",
" <th>method</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
" <th>CountVectorizer</th>\n",
" <td>5.8</td>\n",
" <td>61.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>5.1</td>\n",
" <td>65.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>5.1</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
" <th>CountVectorizer</th>\n",
" <td>6.2</td>\n",
" <td>57.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>6.2</td>\n",
" <td>58.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>5.5</td>\n",
" <td>34.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
" <th>CountVectorizer</th>\n",
" <td>5.5</td>\n",
" <td>54.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>5.3</td>\n",
" <td>50.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>4.9</td>\n",
" <td>39.1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s] memory_peak [MB]\n",
"version method \n",
"reference vers. CountVectorizer 5.8 61.6\n",
" TfidfVectorizer 5.1 65.4\n",
" HashingVectorizer 5.1 16.0\n",
"PR #5122 + Is. #5306 CountVectorizer 6.2 57.1\n",
" TfidfVectorizer 6.2 58.6\n",
" HashingVectorizer 5.5 34.4\n",
"PR last version CountVectorizer 5.5 54.9\n",
" TfidfVectorizer 5.3 50.3\n",
" HashingVectorizer 4.9 39.1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
"HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
"HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
"HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
"/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
"\n",
"======= Dataset size: 100000, n_repeat: 1 ======\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" <th>memory_peak [MB]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>version</th>\n",
" <th>method</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
" <th>CountVectorizer</th>\n",
" <td>12.7</td>\n",
" <td>131.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>10.1</td>\n",
" <td>148.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>10.1</td>\n",
" <td>32.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
" <th>CountVectorizer</th>\n",
" <td>12.4</td>\n",
" <td>124.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>12.5</td>\n",
" <td>151.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>10.3</td>\n",
" <td>66.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
" <th>CountVectorizer</th>\n",
" <td>11.3</td>\n",
" <td>116.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TfidfVectorizer</th>\n",
" <td>11.4</td>\n",
" <td>140.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HashingVectorizer</th>\n",
" <td>9.9</td>\n",
" <td>66.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s] memory_peak [MB]\n",
"version method \n",
"reference vers. CountVectorizer 12.7 131.1\n",
" TfidfVectorizer 10.1 148.1\n",
" HashingVectorizer 10.1 32.0\n",
"PR #5122 + Is. #5306 CountVectorizer 12.4 124.7\n",
" TfidfVectorizer 12.5 151.7\n",
" HashingVectorizer 10.3 66.4\n",
"PR last version CountVectorizer 11.3 116.3\n",
" TfidfVectorizer 11.4 140.7\n",
" HashingVectorizer 9.9 66.4"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for max_docs in [10000, 30000, 50000, 100000]:\n",
" if max_docs <= 10000:\n",
" n_repeat = 3\n",
" else:\n",
" n_repeat = 1\n",
" res = measure_feature_extraction(max_docs, n_repeat, truncate=200)\n",
" print('\\n======= Dataset size: {}, n_repeat: {} ======'.format(max_docs, n_repeat))\n",
" display(res)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Performance profiling"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
}
],
"source": [
"%prun -s cumtime count_vectorizer_memory_usage('TfidfVectorizer', max_docs=10000, truncate=None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
" 314601 function calls (314082 primitive calls) in 5.733 seconds\n",
"\n",
" Ordered by: cumulative time\n",
"\n",
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
" 2/1 0.000 0.000 5.733 5.733 {built-in method builtins.exec}\n",
" 1 0.011 0.011 5.733 5.733 <string>:1(<module>)\n",
" 1 0.000 0.000 5.722 5.722 <ipython-input-17-bbcb8b5f61f7>:43(count_vectorizer_memory_usage)\n",
" 1 0.000 0.000 5.721 5.721 text.py:1316(fit_transform)\n",
" 1 0.014 0.014 5.642 5.642 text.py:799(fit_transform)\n",
" 1 1.618 1.618 5.232 5.232 text.py:735(_count_vocab)\n",
" 10009 0.026 0.000 2.537 0.000 text.py:240(<lambda>)\n",
" 10009 0.006 0.000 2.454 0.000 text.py:216(<lambda>)\n",
" 10009 2.448 0.000 2.448 0.000 {method 'findall' of '_sre.SRE_Pattern' objects}\n",
" 10010 0.128 0.000 0.572 0.000 <ipython-input-17-bbcb8b5f61f7>:23(iterate_enron_docs)\n",
" 1 0.045 0.045 0.269 0.269 text.py:680(_sort_features)\n",
" 10009 0.184 0.000 0.221 0.000 {built-in method io.open}\n",
" 1 0.216 0.216 0.216 0.216 {built-in method builtins.sorted}\n",
" 10009 0.205 0.000 0.205 0.000 {method 'extend' of 'array.array' objects}\n",
" 10009 0.126 0.000 0.157 0.000 {method 'read' of '_io.TextIOWrapper' objects}\n",
" 153 0.142 0.001 0.142 0.001 {built-in method numpy.core.multiarray.array}\n",
" 119 0.000 0.000 0.140 0.001 numeric.py:414(asarray)\n",
" 1 0.064 0.064 0.128 0.128 text.py:694(_limit_features)\n",
" 3 0.000 0.000 0.110 0.037 base.py:326(__mul__)\n",
" 2 0.000 0.000 0.106 0.053 compressed.py:471(_mul_sparse_matrix)\n",
" 1 0.000 0.000 0.095 0.095 compressed.py:1012(sort_indices)\n",
" 1 0.095 0.095 0.095 0.095 {built-in method scipy.sparse._sparsetools.csr_sort_indices}\n",
" 2 0.080 0.040 0.080 0.040 {built-in method scipy.sparse._sparsetools.csr_matmat_pass2}\n",
" 1 0.000 0.000 0.066 0.066 text.py:1023(transform)\n",
" 10009 0.058 0.000 0.058 0.000 {method 'extend' of 'list' objects}\n",
" 1 0.000 0.000 0.052 0.052 csr.py:236(__getitem__)\n",
" 10178 0.025 0.000 0.043 0.000 posixpath.py:71(join)\n",
" 10009 0.008 0.000 0.034 0.000 text.py:207(<lambda>)\n",
" 10009 0.018 0.000 0.031 0.000 codecs.py:318(decode)\n",
" 10009 0.024 0.000 0.024 0.000 {method 'lower' of 'str' objects}\n",
" 2 0.024 0.012 0.024 0.012 {built-in method scipy.sparse._sparsetools.csr_matmat_pass1}\n",
" 690/174 0.019 0.000 0.024 0.000 os.py:298(walk)\n",
" 10009 0.010 0.000 0.022 0.000 _bootlocale.py:23(getpreferredencoding)\n",
" 10009 0.013 0.000 0.017 0.000 text.py:105(decode)\n",
" 10009 0.009 0.000 0.015 0.000 codecs.py:308(__init__)\n",
" 2 0.000 0.000 0.014 0.007 text.py:493(_document_frequency)\n",
" 2 0.014 0.007 0.014 0.007 {built-in method numpy.core.multiarray.bincount}\n",
" 10009 0.013 0.000 0.013 0.000 {built-in method _codecs.utf_8_decode}\n",
" 10009 0.012 0.000 0.012 0.000 {built-in method _locale.nl_langinfo}\n",
" 1 0.002 0.002 0.012 0.012 text.py:997(fit)\n",
" 10178 0.006 0.000 0.010 0.000 posixpath.py:39(_get_sep)\n",
" 1 0.008 0.008 0.008 0.008 {method 'take' of 'numpy.ndarray' objects}\n",
" 1 0.000 0.000 0.008 0.008 data.py:1300(normalize)\n",
" 20301 0.007 0.000 0.007 0.000 {built-in method builtins.isinstance}\n",
" 11/10 0.000 0.000 0.007 0.001 compressed.py:24(__init__)\n",
" 10009 0.006 0.000 0.006 0.000 text.py:126(_word_ngrams)\n",
" 1 0.006 0.006 0.006 0.006 {sklearn.utils.sparsefuncs_fast.inplace_csr_row_normalize_l2}\n",
" 10009 0.006 0.000 0.006 0.000 codecs.py:259(__init__)\n",
" 10178 0.005 0.000 0.005 0.000 {method 'startswith' of 'str' objects}\n",
" 1 0.000 0.000 0.005 0.005 compressed.py:556(sum)\n",
" 1 0.000 0.000 0.005 0.005 base.py:795(sum)\n",
" 1 0.000 0.000 0.004 0.004 base.py:413(__rmul__)\n",
" 1 0.000 0.000 0.004 0.004 compressed.py:445(_mul_vector)\n",
" 1 0.004 0.004 0.004 0.004 {built-in method scipy.sparse._sparsetools.csc_matvec}\n",
" 4 0.000 0.000 0.003 0.001 base.py:236(asformat)\n",
" 10179 0.003 0.000 0.003 0.000 {method 'endswith' of 'str' objects}\n",
" 10155 0.003 0.000 0.003 0.000 {built-in method builtins.len}\n",
" 1 0.000 0.000 0.003 0.003 construct.py:26(spdiags)\n",
" 1 0.000 0.000 0.002 0.002 base.py:728(tocsr)\n",
" 10010 0.002 0.000 0.002 0.000 {method 'append' of 'array.array' objects}\n",
" 10009 0.002 0.000 0.002 0.000 {method 'keys' of 'dict' objects}\n",
" 10 0.002 0.000 0.002 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n",
" 1 0.001 0.001 0.002 0.002 dia.py:345(tocoo)\n",
" 169 0.000 0.000 0.002 0.000 posixpath.py:158(islink)\n",
" 10009 0.002 0.000 0.002 0.000 text.py:191(<lambda>)\n",
" 1 0.000 0.000 0.002 0.002 imp.py:306(reload)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:sklearn-test]",
"language": "python",
"name": "conda-env-sklearn-test-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment