rth/CountVectorizer_sklearn_performance_benchmark.ipynb

## CountVectorizer_sklearn_performance_benchmark.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Memory / Performance Benchmark for Feature Extraction in Scikit Learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2016-08-27 14:01:38--  https://www.cs.cmu.edu/%7Eenron/enron_mail_20150507.tgz\n",
      "Resolving www.cs.cmu.edu... 128.2.217.13\n",
      "Connecting to www.cs.cmu.edu|128.2.217.13|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 443254787 (423M) [application/x-tar]\n",
      "Saving to: ‘enron_mail_20150507.tgz’\n",
      "\n",
      "enron_mail_20150507 100%[===================>] 422.72M   173KB/s    in 22m 55s \n",
      "\n",
      "2016-08-27 14:24:34 (315 KB/s) - ‘enron_mail_20150507.tgz’ saved [443254787/443254787]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# This environment for testing was created with,\n",
    "#\n",
    "#      conda create -n sklearn-test numpy scipy nose ipython jupyter pandas python=3.5\n",
    "#      source activate sklearn-test\n",
    "#      pip install memory_profiler psutil\n",
    "# \n",
    "# and scikit-learn package installed with\n",
    "#      python setup.py develop\n",
    "#\n",
    "# The benchmarks are performed with a, \n",
    "#      Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz, NVME SSD \n",
    "# and take a few hours to run.\n",
    "\n",
    "\n",
    "# getting the enron email dataset\n",
    "!wget https://www.cs.cmu.edu/%7Eenron/enron_mail_20150507.tgz\n",
    "!tar xzf enron_mail_20150507.tgz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Importing: /home/rth/sandbox/scikit-learn/sklearn/feature_extraction/text.py\n"
     ]
    }
   ],
   "source": [
    "import memory_profiler\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from imp import reload\n",
    "import os\n",
    "from time import time\n",
    "from IPython.display import display\n",
    "pd.options.display.float_format = '{:,.1f}'.format\n",
    "import sklearn.feature_extraction.text\n",
    "\n",
    "print('Importing: {}'.format(sklearn.feature_extraction.text.__file__))\n",
    "\n",
    "vectorizer_versions = [('reference vers.', 'acf8368ca3b426e504c00d1df3c2d51b7fa89c65'),                       \n",
    "                       #('PR #4968', '5c544e91eee6e18d38d29b523a322706426e76d4'),\n",
    "                       #('PR #5122', \"0b2a0a07520a29ef64e2a9171151bcc580bbd6aa\"),\n",
    "                       ('PR #5122 + Is. #5306', '1418392047968bb7d5a1ed4a5e523ec071b42adf'),\n",
    "                       ('PR last version', \"039f7ebf31ab7334832e4751c43d70b6f53e40b6\")]\n",
    "\n",
    "SKLEARN_DIR = \"~/sandbox/scikit-learn\"\n",
    "MAX_DOCS = 10000\n",
    "\n",
    "\n",
    "def iterate_enron_docs(path, max_docs=None, truncate=None):\n",
    "    \"\"\"Yield the text content of the files found under `path`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str\n",
    "        root directory, walked recursively with os.walk\n",
    "    max_docs : int or None\n",
    "        maximum number of documents to yield; None means no limit\n",
    "    truncate : int or None\n",
    "        if not None, only the first `truncate` characters of each\n",
    "        document are yielded\n",
    "\n",
    "    Files that raise UnicodeDecodeError when read are skipped and do not\n",
    "    count towards `max_docs`.\n",
    "    \"\"\"\n",
    "    if max_docs is not None and max_docs <= 0:\n",
    "        return\n",
    "    num = 0\n",
    "    for root, dirs, files in os.walk(path):\n",
    "        for name in files:\n",
    "            try:\n",
    "                with open(os.path.join(root, name), 'r') as fh:\n",
    "                    txt = fh.read()\n",
    "            except UnicodeDecodeError:\n",
    "                # best effort: ignore files that cannot be decoded as text\n",
    "                continue\n",
    "            # the file is already closed here, so no handle stays open\n",
    "            # while the consumer processes the document\n",
    "            yield txt if truncate is None else txt[:truncate]\n",
    "            num += 1\n",
    "            # checked after every document (not once per directory), so that\n",
    "            # exactly max_docs documents are produced\n",
    "            if max_docs is not None and num >= max_docs:\n",
    "                return\n",
    "\n",
    "def git_checkout(commit_id):\n",
    "    \"\"\"Check out `commit_id` in the git repository located at SKLEARN_DIR.\"\"\"\n",
    "    # '&&' instead of ';': git must not run in the current directory when cd fails.\n",
    "    # No 'cd -' afterwards: the directory change of a '!' command is not kept.\n",
    "    !cd $SKLEARN_DIR && git checkout $commit_id\n",
    "\n",
    "def count_vectorizer_memory_usage(method, max_docs, truncate):\n",
    "    \"\"\"Run fit_transform of the vectorizer class named `method` on the Enron documents.\n",
    "\n",
    "    sklearn.feature_extraction.text is reloaded first, so the class is looked up\n",
    "    in the re-imported module. `max_docs` and `truncate` are forwarded to\n",
    "    iterate_enron_docs, which reads the documents from \"./maildir/\".\n",
    "    \"\"\"\n",
    "    reload(sklearn.feature_extraction.text)\n",
    "    vectorizer = getattr(sklearn.feature_extraction.text, method)()\n",
    "    documents = iterate_enron_docs(\"./maildir/\", max_docs=max_docs, truncate=truncate)\n",
    "    vectorizer.fit_transform(documents)\n",
    "\n",
    "def measure_feature_extraction(max_docs, n_repeat, truncate=None):\n",
    "    \"\"\"Benchmark the text vectorizers for every entry of vectorizer_versions.\n",
    "\n",
    "    For each (version, commit_id) pair the commit is checked out with\n",
    "    git_checkout, then every vectorizer is run `n_repeat` times under\n",
    "    memory_profiler.memory_usage.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    max_docs, truncate\n",
    "        forwarded to count_vectorizer_memory_usage\n",
    "    n_repeat : int\n",
    "        number of runs per (version, method) pair\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame grouped by (version, method) with the mean over the runs of\n",
    "    'cpu_time [s]' (elapsed time() around the memory_usage call) and\n",
    "    'memory_peak [MB]' (max minus min of the values returned by memory_usage).\n",
    "    \"\"\"\n",
    "    methods = ['CountVectorizer', 'TfidfVectorizer', 'HashingVectorizer']\n",
    "    records = []\n",
    "    for version, commit_id in vectorizer_versions:\n",
    "        git_checkout(commit_id)\n",
    "        for method in methods:\n",
    "            for _ in range(n_repeat):\n",
    "                started = time()\n",
    "                samples = memory_profiler.memory_usage(\n",
    "                    (count_vectorizer_memory_usage, (method, max_docs, truncate)))\n",
    "                elapsed = time() - started\n",
    "                records.append({'cpu_time [s]': elapsed,\n",
    "                                'version': version,\n",
    "                                'method': method,\n",
    "                                'memory_peak [MB]': max(samples) - min(samples)})\n",
    "\n",
    "    # there seems to be a significant fluctuation from run to run for small\n",
    "    # datasets, hence the average over multiple runs\n",
    "    frame = pd.DataFrame(records)\n",
    "    return frame.groupby(by=['version', 'method'], sort=False).mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Benchmark on non-truncated documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Document length statististics: min=383 char, mean=2719.120701086967 char, max=2011941 char\n"
     ]
    }
   ],
   "source": [
    "# number of characters of every document in the dataset\n",
    "docs_len = np.asarray(list(map(len, iterate_enron_docs(\"./maildir/\"))))\n",
    "stats_msg = 'Document length statististics: min={} char, mean={} char, max={} char'\n",
    "print(stats_msg.format(docs_len.min(), docs_len.mean(), docs_len.max()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
      "HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
      "HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "\n",
      "======= Dataset size: 10000, n_repeat: 3 ======\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>cpu_time [s]</th>\n",
       "      <th>memory_peak [MB]</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <th>method</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>5.5</td>\n",
       "      <td>98.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>5.3</td>\n",
       "      <td>91.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>6.3</td>\n",
       "      <td>78.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>6.2</td>\n",
       "      <td>62.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>6.3</td>\n",
       "      <td>56.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>6.1</td>\n",
       "      <td>69.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>5.4</td>\n",
       "      <td>63.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>5.7</td>\n",
       "      <td>63.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>6.3</td>\n",
       "      <td>70.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        cpu_time [s]  memory_peak [MB]\n",
       "version              method                                           \n",
       "reference vers.      CountVectorizer             5.5              98.4\n",
       "                     TfidfVectorizer             5.3              91.4\n",
       "                     HashingVectorizer           6.3              78.8\n",
       "PR #5122 + Is. #5306 CountVectorizer             6.2              62.5\n",
       "                     TfidfVectorizer             6.3              56.4\n",
       "                     HashingVectorizer           6.1              69.4\n",
       "PR last version      CountVectorizer             5.4              63.1\n",
       "                     TfidfVectorizer             5.7              63.2\n",
       "                     HashingVectorizer           6.3              70.4"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
      "HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
      "HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "\n",
      "======= Dataset size: 30000, n_repeat: 1 ======\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>cpu_time [s]</th>\n",
       "      <th>memory_peak [MB]</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <th>method</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>17.4</td>\n",
       "      <td>321.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>17.6</td>\n",
       "      <td>322.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>20.0</td>\n",
       "      <td>238.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>20.4</td>\n",
       "      <td>171.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>21.0</td>\n",
       "      <td>229.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>20.9</td>\n",
       "      <td>191.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>18.0</td>\n",
       "      <td>191.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>18.1</td>\n",
       "      <td>225.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>20.2</td>\n",
       "      <td>199.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        cpu_time [s]  memory_peak [MB]\n",
       "version              method                                           \n",
       "reference vers.      CountVectorizer            17.4             321.8\n",
       "                     TfidfVectorizer            17.6             322.1\n",
       "                     HashingVectorizer          20.0             238.4\n",
       "PR #5122 + Is. #5306 CountVectorizer            20.4             171.7\n",
       "                     TfidfVectorizer            21.0             229.6\n",
       "                     HashingVectorizer          20.9             191.3\n",
       "PR last version      CountVectorizer            18.0             191.5\n",
       "                     TfidfVectorizer            18.1             225.5\n",
       "                     HashingVectorizer          20.2             199.4"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
      "HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
      "HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "\n",
      "======= Dataset size: 50000, n_repeat: 1 ======\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>cpu_time [s]</th>\n",
       "      <th>memory_peak [MB]</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <th>method</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>28.7</td>\n",
       "      <td>476.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>27.8</td>\n",
       "      <td>476.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>32.8</td>\n",
       "      <td>462.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>30.8</td>\n",
       "      <td>276.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>31.4</td>\n",
       "      <td>356.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>32.5</td>\n",
       "      <td>388.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>28.9</td>\n",
       "      <td>306.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>31.5</td>\n",
       "      <td>366.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>33.3</td>\n",
       "      <td>363.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        cpu_time [s]  memory_peak [MB]\n",
       "version              method                                           \n",
       "reference vers.      CountVectorizer            28.7             476.7\n",
       "                     TfidfVectorizer            27.8             476.9\n",
       "                     HashingVectorizer          32.8             462.2\n",
       "PR #5122 + Is. #5306 CountVectorizer            30.8             276.1\n",
       "                     TfidfVectorizer            31.4             356.6\n",
       "                     HashingVectorizer          32.5             388.9\n",
       "PR last version      CountVectorizer            28.9             306.4\n",
       "                     TfidfVectorizer            31.5             366.7\n",
       "                     HashingVectorizer          33.3             363.2"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
      "HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
      "HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "\n",
      "======= Dataset size: 100000, n_repeat: 1 ======\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>cpu_time [s]</th>\n",
       "      <th>memory_peak [MB]</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <th>method</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>66.5</td>\n",
       "      <td>1,056.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>67.6</td>\n",
       "      <td>1,051.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>75.7</td>\n",
       "      <td>888.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>69.4</td>\n",
       "      <td>563.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>70.2</td>\n",
       "      <td>729.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>71.8</td>\n",
       "      <td>877.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>65.0</td>\n",
       "      <td>570.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>66.2</td>\n",
       "      <td>740.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>74.9</td>\n",
       "      <td>807.3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        cpu_time [s]  memory_peak [MB]\n",
       "version              method                                           \n",
       "reference vers.      CountVectorizer            66.5           1,056.8\n",
       "                     TfidfVectorizer            67.6           1,051.0\n",
       "                     HashingVectorizer          75.7             888.6\n",
       "PR #5122 + Is. #5306 CountVectorizer            69.4             563.8\n",
       "                     TfidfVectorizer            70.2             729.2\n",
       "                     HashingVectorizer          71.8             877.4\n",
       "PR last version      CountVectorizer            65.0             570.6\n",
       "                     TfidfVectorizer            66.2             740.9\n",
       "                     HashingVectorizer          74.9             807.3"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for max_docs in [10000, 30000, 50000, 100000]:\n",
    "    # only the smallest dataset is run several times and averaged\n",
    "    n_repeat = 3 if max_docs <= 10000 else 1\n",
    "    res = measure_feature_extraction(max_docs, n_repeat, truncate=None)\n",
    "    header = '\\n======= Dataset size: {}, n_repeat: {} ======'\n",
    "    print(header.format(max_docs, n_repeat))\n",
    "    display(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Benchmark on documents truncated to 200 char length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: checking out 'acf8368ca3b426e504c00d1df3c2d51b7fa89c65'.\n",
      "\n",
      "You are in 'detached HEAD' state. You can look around, make experimental\n",
      "changes and commit them, and you can discard any commits you make in this\n",
      "state without impacting any branches by performing another checkout.\n",
      "\n",
      "If you want to create a new branch to retain commits you create, you may\n",
      "do so (now or later) by using -b with the checkout command again. Example:\n",
      "\n",
      "  git checkout -b <new-branch-name>\n",
      "\n",
      "HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
      "HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "\n",
      "======= Dataset size: 10000, n_repeat: 3 ======\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>cpu_time [s]</th>\n",
       "      <th>memory_peak [MB]</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <th>method</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>1.1</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>1.0</td>\n",
       "      <td>7.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>1.1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>1.2</td>\n",
       "      <td>6.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>1.1</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>1.2</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>1.1</td>\n",
       "      <td>6.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>1.1</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        cpu_time [s]  memory_peak [MB]\n",
       "version              method                                           \n",
       "reference vers.      CountVectorizer             1.1               2.8\n",
       "                     TfidfVectorizer             1.0               7.4\n",
       "                     HashingVectorizer           1.0               2.0\n",
       "PR #5122 + Is. #5306 CountVectorizer             1.1               1.0\n",
       "                     TfidfVectorizer             1.2               6.3\n",
       "                     HashingVectorizer           1.1               2.0\n",
       "PR last version      CountVectorizer             1.2               0.5\n",
       "                     TfidfVectorizer             1.1               6.9\n",
       "                     HashingVectorizer           1.1               2.0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
      "HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
      "HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "\n",
      "======= Dataset size: 30000, n_repeat: 1 ======\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>cpu_time [s]</th>\n",
       "      <th>memory_peak [MB]</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <th>method</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>3.9</td>\n",
       "      <td>37.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>2.8</td>\n",
       "      <td>38.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>3.0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>3.4</td>\n",
       "      <td>33.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>3.4</td>\n",
       "      <td>41.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>3.0</td>\n",
       "      <td>21.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>3.2</td>\n",
       "      <td>32.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>3.1</td>\n",
       "      <td>36.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>3.0</td>\n",
       "      <td>18.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        cpu_time [s]  memory_peak [MB]\n",
       "version              method                                           \n",
       "reference vers.      CountVectorizer             3.9              37.0\n",
       "                     TfidfVectorizer             2.8              38.6\n",
       "                     HashingVectorizer           3.0              17.0\n",
       "PR #5122 + Is. #5306 CountVectorizer             3.4              33.8\n",
       "                     TfidfVectorizer             3.4              41.9\n",
       "                     HashingVectorizer           3.0              21.6\n",
       "PR last version      CountVectorizer             3.2              32.2\n",
       "                     TfidfVectorizer             3.1              36.3\n",
       "                     HashingVectorizer           3.0              18.4"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
      "HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
      "HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "\n",
      "======= Dataset size: 50000, n_repeat: 1 ======\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>cpu_time [s]</th>\n",
       "      <th>memory_peak [MB]</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <th>method</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>5.8</td>\n",
       "      <td>61.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>5.1</td>\n",
       "      <td>65.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>5.1</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>6.2</td>\n",
       "      <td>57.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>6.2</td>\n",
       "      <td>58.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>5.5</td>\n",
       "      <td>34.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>5.5</td>\n",
       "      <td>54.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>5.3</td>\n",
       "      <td>50.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>4.9</td>\n",
       "      <td>39.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        cpu_time [s]  memory_peak [MB]\n",
       "version              method                                           \n",
       "reference vers.      CountVectorizer             5.8              61.6\n",
       "                     TfidfVectorizer             5.1              65.4\n",
       "                     HashingVectorizer           5.1              16.0\n",
       "PR #5122 + Is. #5306 CountVectorizer             6.2              57.1\n",
       "                     TfidfVectorizer             6.2              58.6\n",
       "                     HashingVectorizer           5.5              34.4\n",
       "PR last version      CountVectorizer             5.5              54.9\n",
       "                     TfidfVectorizer             5.3              50.3\n",
       "                     HashingVectorizer           4.9              39.1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous HEAD position was 039f7eb... Text vecotorizer: adressing review comments\n",
      "HEAD is now at acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was acf8368... [MRG + 1] doc: specifically address reinforcement learning in faq (#6479)\n",
      "HEAD is now at 1418392... Improved feature extraction performance (issue #5306)\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "Previous HEAD position was 1418392... Improved feature extraction performance (issue #5306)\n",
      "HEAD is now at 039f7eb... Text vecotorizer: adressing review comments\n",
      "/home/rth/data-science/sklearn-gist/CountVectorizer_MemoryUse\n",
      "\n",
      "======= Dataset size: 100000, n_repeat: 1 ======\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>cpu_time [s]</th>\n",
       "      <th>memory_peak [MB]</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <th>method</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reference vers.</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>12.7</td>\n",
       "      <td>131.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>10.1</td>\n",
       "      <td>148.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>10.1</td>\n",
       "      <td>32.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR #5122 + Is. #5306</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>12.4</td>\n",
       "      <td>124.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>12.5</td>\n",
       "      <td>151.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>10.3</td>\n",
       "      <td>66.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">PR last version</th>\n",
       "      <th>CountVectorizer</th>\n",
       "      <td>11.3</td>\n",
       "      <td>116.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TfidfVectorizer</th>\n",
       "      <td>11.4</td>\n",
       "      <td>140.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HashingVectorizer</th>\n",
       "      <td>9.9</td>\n",
       "      <td>66.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        cpu_time [s]  memory_peak [MB]\n",
       "version              method                                           \n",
       "reference vers.      CountVectorizer            12.7             131.1\n",
       "                     TfidfVectorizer            10.1             148.1\n",
       "                     HashingVectorizer          10.1              32.0\n",
       "PR #5122 + Is. #5306 CountVectorizer            12.4             124.7\n",
       "                     TfidfVectorizer            12.5             151.7\n",
       "                     HashingVectorizer          10.3              66.4\n",
       "PR last version      CountVectorizer            11.3             116.3\n",
       "                     TfidfVectorizer            11.4             140.7\n",
       "                     HashingVectorizer           9.9              66.4"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Benchmark every dataset size; sizes up to 10000 documents are repeated 3 times, larger ones once.\n",
    "for max_docs in (10000, 30000, 50000, 100000):\n",
    "    n_repeat = 3 if max_docs <= 10000 else 1\n",
    "    res = measure_feature_extraction(max_docs, n_repeat, truncate=200)\n",
    "    # The header is printed after the measurement, right above the table it describes.\n",
    "    print('\\n======= Dataset size: {}, n_repeat: {} ======'.format(max_docs, n_repeat))\n",
    "    display(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Performance profiling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " "
     ]
    }
   ],
   "source": [
    "# Profile one TfidfVectorizer run (max_docs=10000, truncate=None); -s cumtime orders the report by cumulative time.\n",
    "%prun -s cumtime count_vectorizer_memory_usage('TfidfVectorizer', max_docs=10000, truncate=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Profiler output of the `%prun` call above, pasted as text:\n",
    "\n",
    "```\n",
    "         314601 function calls (314082 primitive calls) in 5.733 seconds\n",
    "\n",
    "   Ordered by: cumulative time\n",
    "\n",
    "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
    "      2/1    0.000    0.000    5.733    5.733 {built-in method builtins.exec}\n",
    "        1    0.011    0.011    5.733    5.733 <string>:1(<module>)\n",
    "        1    0.000    0.000    5.722    5.722 <ipython-input-17-bbcb8b5f61f7>:43(count_vectorizer_memory_usage)\n",
    "        1    0.000    0.000    5.721    5.721 text.py:1316(fit_transform)\n",
    "        1    0.014    0.014    5.642    5.642 text.py:799(fit_transform)\n",
    "        1    1.618    1.618    5.232    5.232 text.py:735(_count_vocab)\n",
    "    10009    0.026    0.000    2.537    0.000 text.py:240(<lambda>)\n",
    "    10009    0.006    0.000    2.454    0.000 text.py:216(<lambda>)\n",
    "    10009    2.448    0.000    2.448    0.000 {method 'findall' of '_sre.SRE_Pattern' objects}\n",
    "    10010    0.128    0.000    0.572    0.000 <ipython-input-17-bbcb8b5f61f7>:23(iterate_enron_docs)\n",
    "        1    0.045    0.045    0.269    0.269 text.py:680(_sort_features)\n",
    "    10009    0.184    0.000    0.221    0.000 {built-in method io.open}\n",
    "        1    0.216    0.216    0.216    0.216 {built-in method builtins.sorted}\n",
    "    10009    0.205    0.000    0.205    0.000 {method 'extend' of 'array.array' objects}\n",
    "    10009    0.126    0.000    0.157    0.000 {method 'read' of '_io.TextIOWrapper' objects}\n",
    "      153    0.142    0.001    0.142    0.001 {built-in method numpy.core.multiarray.array}\n",
    "      119    0.000    0.000    0.140    0.001 numeric.py:414(asarray)\n",
    "        1    0.064    0.064    0.128    0.128 text.py:694(_limit_features)\n",
    "        3    0.000    0.000    0.110    0.037 base.py:326(__mul__)\n",
    "        2    0.000    0.000    0.106    0.053 compressed.py:471(_mul_sparse_matrix)\n",
    "        1    0.000    0.000    0.095    0.095 compressed.py:1012(sort_indices)\n",
    "        1    0.095    0.095    0.095    0.095 {built-in method scipy.sparse._sparsetools.csr_sort_indices}\n",
    "        2    0.080    0.040    0.080    0.040 {built-in method scipy.sparse._sparsetools.csr_matmat_pass2}\n",
    "        1    0.000    0.000    0.066    0.066 text.py:1023(transform)\n",
    "    10009    0.058    0.000    0.058    0.000 {method 'extend' of 'list' objects}\n",
    "        1    0.000    0.000    0.052    0.052 csr.py:236(__getitem__)\n",
    "    10178    0.025    0.000    0.043    0.000 posixpath.py:71(join)\n",
    "    10009    0.008    0.000    0.034    0.000 text.py:207(<lambda>)\n",
    "    10009    0.018    0.000    0.031    0.000 codecs.py:318(decode)\n",
    "    10009    0.024    0.000    0.024    0.000 {method 'lower' of 'str' objects}\n",
    "        2    0.024    0.012    0.024    0.012 {built-in method scipy.sparse._sparsetools.csr_matmat_pass1}\n",
    "  690/174    0.019    0.000    0.024    0.000 os.py:298(walk)\n",
    "    10009    0.010    0.000    0.022    0.000 _bootlocale.py:23(getpreferredencoding)\n",
    "    10009    0.013    0.000    0.017    0.000 text.py:105(decode)\n",
    "    10009    0.009    0.000    0.015    0.000 codecs.py:308(__init__)\n",
    "        2    0.000    0.000    0.014    0.007 text.py:493(_document_frequency)\n",
    "        2    0.014    0.007    0.014    0.007 {built-in method numpy.core.multiarray.bincount}\n",
    "    10009    0.013    0.000    0.013    0.000 {built-in method _codecs.utf_8_decode}\n",
    "    10009    0.012    0.000    0.012    0.000 {built-in method _locale.nl_langinfo}\n",
    "        1    0.002    0.002    0.012    0.012 text.py:997(fit)\n",
    "    10178    0.006    0.000    0.010    0.000 posixpath.py:39(_get_sep)\n",
    "        1    0.008    0.008    0.008    0.008 {method 'take' of 'numpy.ndarray' objects}\n",
    "        1    0.000    0.000    0.008    0.008 data.py:1300(normalize)\n",
    "    20301    0.007    0.000    0.007    0.000 {built-in method builtins.isinstance}\n",
    "    11/10    0.000    0.000    0.007    0.001 compressed.py:24(__init__)\n",
    "    10009    0.006    0.000    0.006    0.000 text.py:126(_word_ngrams)\n",
    "        1    0.006    0.006    0.006    0.006 {sklearn.utils.sparsefuncs_fast.inplace_csr_row_normalize_l2}\n",
    "    10009    0.006    0.000    0.006    0.000 codecs.py:259(__init__)\n",
    "    10178    0.005    0.000    0.005    0.000 {method 'startswith' of 'str' objects}\n",
    "        1    0.000    0.000    0.005    0.005 compressed.py:556(sum)\n",
    "        1    0.000    0.000    0.005    0.005 base.py:795(sum)\n",
    "        1    0.000    0.000    0.004    0.004 base.py:413(__rmul__)\n",
    "        1    0.000    0.000    0.004    0.004 compressed.py:445(_mul_vector)\n",
    "        1    0.004    0.004    0.004    0.004 {built-in method scipy.sparse._sparsetools.csc_matvec}\n",
    "        4    0.000    0.000    0.003    0.001 base.py:236(asformat)\n",
    "    10179    0.003    0.000    0.003    0.000 {method 'endswith' of 'str' objects}\n",
    "    10155    0.003    0.000    0.003    0.000 {built-in method builtins.len}\n",
    "        1    0.000    0.000    0.003    0.003 construct.py:26(spdiags)\n",
    "        1    0.000    0.000    0.002    0.002 base.py:728(tocsr)\n",
    "    10010    0.002    0.000    0.002    0.000 {method 'append' of 'array.array' objects}\n",
    "    10009    0.002    0.000    0.002    0.000 {method 'keys' of 'dict' objects}\n",
    "       10    0.002    0.000    0.002    0.000 {method 'reduce' of 'numpy.ufunc' objects}\n",
    "        1    0.001    0.001    0.002    0.002 dia.py:345(tocoo)\n",
    "      169    0.000    0.000    0.002    0.000 posixpath.py:158(islink)\n",
    "    10009    0.002    0.000    0.002    0.000 text.py:191(<lambda>)\n",
    "        1    0.000    0.000    0.002    0.002 imp.py:306(reload)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:sklearn-test]",
   "language": "python",
   "name": "conda-env-sklearn-test-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}