@jtdoepke
Created October 4, 2016 01:31
Testing scikit-learn ngram performance improvements
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# pip install cython jupyter numpy pandas scipy\n",
"# cd scikit-learn; python setup.py develop\n",
"\n",
"# Tested with Python 3.5.1\n",
"\n",
"import sys\n",
"import os\n",
"from time import time\n",
"from imp import reload\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"pd.options.display.float_format = '{:,.3f}'.format\n",
"\n",
"SKLEARN_DIR = \"~/Development/scikit-learn\"\n",
"sys.path.insert(0, os.path.realpath(SKLEARN_DIR))\n",
"from sklearn.utils import safe_indexing, check_random_state\n",
"import sklearn.feature_extraction.text\n",
"\n",
"\n",
"VERSIONS = [\n",
" ('base version', 'dee786ab294c9992145cf4a978ff8975b00ff7ef'), \n",
" ('method binding', '874ed30cba760775b03251100fa6a17fddfca07c'),\n",
" ('method binding + unigram list', \"ac62e1c0c4f563e29f2fb6f4653f107345e05345\"),\n",
"]\n",
"\n",
"\n",
"def git_checkout(commit_id):\n",
" !cd $SKLEARN_DIR; git checkout $commit_id;\n",
" reload(sklearn.feature_extraction.text)\n",
"\n",
" \n",
"def make_ngrams(docs, ngram_range, analyzer):\n",
" vectorizer = sklearn.feature_extraction.text.HashingVectorizer(ngram_range=ngram_range)\n",
" if analyzer == 'word':\n",
" f = vectorizer._word_ngrams\n",
" elif analyzer == 'char':\n",
" f = vectorizer._char_ngrams\n",
" elif analyzer == 'char_wb':\n",
" f = vectorizer._char_wb_ngrams\n",
" for doc in docs:\n",
" f(doc)\n",
"\n",
" \n",
"def evaluate_ngrams(docs, n_repeat, ngram_range, analyzer):\n",
" out = []\n",
" for version, commit_id in VERSIONS:\n",
" git_checkout(commit_id)\n",
" for k in range(n_repeat):\n",
" start_t = time()\n",
" make_ngrams(docs, ngram_range, analyzer)\n",
" end_t = time()\n",
" out.append({'cpu_time [s]': end_t - start_t, 'version': version, 'ngram_range': ngram_range})\n",
" return pd.DataFrame(out).groupby(by=['ngram_range', 'version'], sort=False).mean()"
]
},
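{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustrative aside (not part of the original benchmark): the timing code above calls the private `_word_ngrams`, `_char_ngrams` and `_char_wb_ngrams` methods directly so that only n-gram generation is measured. The sketch below uses the public `build_analyzer()` API instead, just to show what each analyzer produces for one small document."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sanity check (added for illustration): n-grams emitted by each analyzer for a single document.\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"sample = \"consider minute accord\"\n",
"for analyzer in ('word', 'char', 'char_wb'):\n",
"    ngrams = CountVectorizer(analyzer=analyzer, ngram_range=(1, 4)).build_analyzer()(sample)\n",
"    print(analyzer, len(ngrams), ngrams[:5])"
]
},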
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"WORDS = np.array(\"\"\"consider minute accord evident practice intend concern commit\n",
"issue approach establish utter conduct engage obtain scarce policy straight\n",
"stock apparent property fancy concept court appoint passage vain instance coast\n",
"project commission constant circumstances constitute level affect institute\n",
"render appeal generate theory range campaign league labor confer grant dwell \n",
"entertain contract earnest yield wander insist knight convince inspire convention\n",
"skill harry financial reflect novel furnish compel venture territory temper bent intimate\n",
"undertake majority assert crew chamber humble scheme keen liberal despair tide attitude\n",
"justify flag merit manifest notion scale formal resource persist contempt tour plead weigh\n",
"mode distinction inclined attribute exert oppress contend stake toil perish disposition\n",
"rail cardinal boast advocate bestow allege notwithstanding lofty multitude steep heed modest\n",
"partial apt esteem credible provoke tread ascertain fare cede perpetual decree contrive\n",
"derived elaborate substantial frontier facile cite warrant sob rider dense afflict\n",
"flourish ordain pious vex gravity suspended conspicuous retort jet bolt assent purse\n",
"plus sanction proceeding exalt siege malice extravagant wax throng venerate assail\n",
"sublime exploit exertion kindle endow imposed humiliate suffrage ensue brook gale muse \n",
"satire intrigue indication dispatch cower wont tract canon impel latitude vacate undertaking \n",
"slay predecessor delicacy forsake beseech philosophical grove frustrate illustrious device \n",
"pomp entreat impart propriety consecrate proceeds fathom objective clad partisan faction \n",
"contrived venerable restrained besiege manifestation rebuke insurgent rhetoric scrupulous \n",
"ratify stump discreet imposing wistful mortify ripple premise subside adverse caprice \n",
"muster comprehensive accede fervent cohere tribunal austere recovering stratum \n",
"conscientious arbitrary exasperate conjure ominous edifice elude pervade foster admonish \n",
"repeal retiring incidental acquiesce slew usurp sentinel precision depose wanton \n",
"odium precept deference fray candid enduring impertinent bland insinuate nominal \n",
"suppliant languid rave monetary headlong infallible coax explicate gaunt morbid ranging \n",
"pacify pastoral dogged ebb aide appease stipulate recourse constrained bate aversion \n",
"conceit loath rampart extort tarry perpetrate decorum luxuriant cant enjoin avarice edict \n",
"disconcert symmetry capitulate arbitrate cleave append visage horde parable chastise foil \n",
"veritable grapple gentry pall maxim projection prowess dingy semblance tout fortitude asunder \n",
"rout staid beguile purport deprave bequeath enigma assiduous vassal quail outskirts bulwark swerve \n",
"gird betrothed prospective advert peremptory rudiment deduce halting ignominy ideology pallid\"\"\".split())\n",
"\n",
"def make_documents(n_docs, min_sent_size, max_sent_size):\n",
" # Generate some random, pre-tokenized documents.\n",
" random_state = check_random_state(0)\n",
" docs = []\n",
" for _ in range(n_docs):\n",
" docs.append(list(random_state.choice(WORDS, size=random_state.randint(min_sent_size, max_sent_size + 1))))\n",
" return docs"
]
},
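{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick look at the synthetic corpus (added for illustration): `make_documents` returns pre-tokenized documents, i.e. lists of words sampled from `WORDS`, with lengths drawn uniformly from `[min_sent_size, max_sent_size]` and a fixed random seed for reproducibility."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Preview a few generated documents; check_random_state(0) makes this reproducible.\n",
"for doc in make_documents(3, 4, 12):\n",
"    print(len(doc), doc)"
]
},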
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was ac62e1c... Improve ngram performance - unigram list\n",
"HEAD is now at dee786a... DOC fix typo in NearestNeighbors docstring (#7545)\n",
"Previous HEAD position was dee786a... DOC fix typo in NearestNeighbors docstring (#7545)\n",
"HEAD is now at 874ed30... Improve ngram performance - method binding\n",
"Previous HEAD position was 874ed30... Improve ngram performance - method binding\n",
"HEAD is now at ac62e1c... Improve ngram performance - unigram list\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ngram_range</th>\n",
" <th>version</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">(1, 4)</th>\n",
" <th>base version</th>\n",
" <td>1.080</td>\n",
" </tr>\n",
" <tr>\n",
" <th>method binding</th>\n",
" <td>0.897</td>\n",
" </tr>\n",
" <tr>\n",
" <th>method binding + unigram list</th>\n",
" <td>0.699</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s]\n",
"ngram_range version \n",
"(1, 4) base version 1.080\n",
" method binding 0.897\n",
" method binding + unigram list 0.699"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"documents = make_documents(100000, 4, 12)\n",
"evaluate_ngrams(documents, 1000, (1, 4), 'word')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was ac62e1c... Improve ngram performance - unigram list\n",
"HEAD is now at dee786a... DOC fix typo in NearestNeighbors docstring (#7545)\n",
"Previous HEAD position was dee786a... DOC fix typo in NearestNeighbors docstring (#7545)\n",
"HEAD is now at 874ed30... Improve ngram performance - method binding\n",
"Previous HEAD position was 874ed30... Improve ngram performance - method binding\n",
"HEAD is now at ac62e1c... Improve ngram performance - unigram list\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ngram_range</th>\n",
" <th>version</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">(1, 4)</th>\n",
" <th>base version</th>\n",
" <td>4.335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>method binding</th>\n",
" <td>3.475</td>\n",
" </tr>\n",
" <tr>\n",
" <th>method binding + unigram list</th>\n",
" <td>2.820</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s]\n",
"ngram_range version \n",
"(1, 4) base version 4.335\n",
" method binding 3.475\n",
" method binding + unigram list 2.820"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"documents = [\" \".join(doc) for doc in documents]\n",
"evaluate_ngrams(documents, 1000, (1, 4), 'char')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Previous HEAD position was ac62e1c... Improve ngram performance - unigram list\n",
"HEAD is now at dee786a... DOC fix typo in NearestNeighbors docstring (#7545)\n",
"Previous HEAD position was dee786a... DOC fix typo in NearestNeighbors docstring (#7545)\n",
"HEAD is now at 874ed30... Improve ngram performance - method binding\n",
"Previous HEAD position was 874ed30... Improve ngram performance - method binding\n",
"HEAD is now at ac62e1c... Improve ngram performance - unigram list\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>cpu_time [s]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ngram_range</th>\n",
" <th>version</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">(1, 4)</th>\n",
" <th>base version</th>\n",
" <td>5.767</td>\n",
" </tr>\n",
" <tr>\n",
" <th>method binding</th>\n",
" <td>5.028</td>\n",
" </tr>\n",
" <tr>\n",
" <th>method binding + unigram list</th>\n",
" <td>5.044</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cpu_time [s]\n",
"ngram_range version \n",
"(1, 4) base version 5.767\n",
" method binding 5.028\n",
" method binding + unigram list 5.044"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluate_ngrams(documents, 1000, (1, 4), 'char_wb')"
]
},
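{
"cell_type": "markdown",
"metadata": {},
"source": [
"Summary (added for illustration): relative run times, computed by hand-copying the mean timings from the three result tables above. Each entry is a variant's mean time divided by the base version's mean time for that analyzer, so values below 1.0 are faster than the base commit."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Mean times in seconds, copied from the result tables above (rows follow VERSIONS order).\n",
"timings = pd.DataFrame({\n",
"    'word':    [1.080, 0.897, 0.699],\n",
"    'char':    [4.335, 3.475, 2.820],\n",
"    'char_wb': [5.767, 5.028, 5.044],\n",
"}, index=[version for version, _ in VERSIONS])\n",
"\n",
"# Run time relative to the base version for each analyzer (lower is better).\n",
"timings / timings.loc['base version']"
]
},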
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}