Last active
August 29, 2015 14:01
-
-
Save fscottfoti/1cec32e70a60e440ebce to your computer and use it in GitHub Desktop.
Is patsy slow or am I doing something wrong?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:0cf6cb9492bf54a87341cd63b1eaa77ad1a76608073d08f68c5c2d41d7568686" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import numpy as np\n", | |
"import pandas as pd" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def rand():\n", | |
"    \"\"\"Return a 1-D array of one million samples from np.random.rand.\"\"\"\n", | |
"    return np.random.rand(1000000)\n", | |
"\n", | |
"# One independent draw per column, in the order a, b, c, x, y, z.\n", | |
"df = pd.DataFrame({name: rand() for name in 'abcxyz'})" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.describe()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>a</th>\n", | |
" <th>b</th>\n", | |
" <th>c</th>\n", | |
" <th>x</th>\n", | |
" <th>y</th>\n", | |
" <th>z</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td> 1000000.000000</td>\n", | |
" <td> 1000000.000000</td>\n", | |
" <td> 1000000.000000</td>\n", | |
" <td> 1000000.000000</td>\n", | |
" <td> 1000000.000000</td>\n", | |
" <td> 1000000.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td> 0.499703</td>\n", | |
" <td> 0.500001</td>\n", | |
" <td> 0.500315</td>\n", | |
" <td> 0.500131</td>\n", | |
" <td> 0.500158</td>\n", | |
" <td> 0.499718</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td> 0.288530</td>\n", | |
" <td> 0.288911</td>\n", | |
" <td> 0.288757</td>\n", | |
" <td> 0.288680</td>\n", | |
" <td> 0.288796</td>\n", | |
" <td> 0.288611</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td> 0.000001</td>\n", | |
" <td> 0.000000</td>\n", | |
" <td> 0.000001</td>\n", | |
" <td> 0.000001</td>\n", | |
" <td> 0.000001</td>\n", | |
" <td> 0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td> 0.249909</td>\n", | |
" <td> 0.249649</td>\n", | |
" <td> 0.250089</td>\n", | |
" <td> 0.249889</td>\n", | |
" <td> 0.250296</td>\n", | |
" <td> 0.249225</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td> 0.499495</td>\n", | |
" <td> 0.499889</td>\n", | |
" <td> 0.500764</td>\n", | |
" <td> 0.499820</td>\n", | |
" <td> 0.500345</td>\n", | |
" <td> 0.499891</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td> 0.749365</td>\n", | |
" <td> 0.750536</td>\n", | |
" <td> 0.750163</td>\n", | |
" <td> 0.750429</td>\n", | |
" <td> 0.750276</td>\n", | |
" <td> 0.749692</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td> 1.000000</td>\n", | |
" <td> 0.999999</td>\n", | |
" <td> 0.999999</td>\n", | |
" <td> 0.999998</td>\n", | |
" <td> 0.999999</td>\n", | |
" <td> 1.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>8 rows \u00d7 6 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
" a b c x \\\n", | |
"count 1000000.000000 1000000.000000 1000000.000000 1000000.000000 \n", | |
"mean 0.499703 0.500001 0.500315 0.500131 \n", | |
"std 0.288530 0.288911 0.288757 0.288680 \n", | |
"min 0.000001 0.000000 0.000001 0.000001 \n", | |
"25% 0.249909 0.249649 0.250089 0.249889 \n", | |
"50% 0.499495 0.499889 0.500764 0.499820 \n", | |
"75% 0.749365 0.750536 0.750163 0.750429 \n", | |
"max 1.000000 0.999999 0.999999 0.999998 \n", | |
"\n", | |
" y z \n", | |
"count 1000000.000000 1000000.000000 \n", | |
"mean 0.500158 0.499718 \n", | |
"std 0.288796 0.288611 \n", | |
"min 0.000001 0.000000 \n", | |
"25% 0.250296 0.249225 \n", | |
"50% 0.500345 0.499891 \n", | |
"75% 0.750276 0.749692 \n", | |
"max 0.999999 1.000000 \n", | |
"\n", | |
"[8 rows x 6 columns]" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import patsy\n", | |
"%timeit patsy.dmatrix(\"a + b + I(c<.3) + I(z>.7) + np.log1p(x) + np.log1p(y)\", df)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 11.5 s per loop\n" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def go():\n", | |
"    \"\"\"Assemble by hand the columns a, b, c < .3, z > .7, log1p(x), log1p(y).\n", | |
"\n", | |
"    Reads the module-level frame df and returns a new frame on the same index.\n", | |
"    \"\"\"\n", | |
"    manual = pd.DataFrame(index=df.index)\n", | |
"    # Plain copies of the untransformed columns.\n", | |
"    for name in (\"a\", \"b\"):\n", | |
"        manual[name] = df[name]\n", | |
"    # Boolean indicator columns.\n", | |
"    manual[\"c\"] = df[\"c\"].lt(.3)\n", | |
"    manual[\"z\"] = df[\"z\"].gt(.7)\n", | |
"    # log1p applied to the whole column at once.\n", | |
"    for name in (\"x\", \"y\"):\n", | |
"        manual[name] = np.log1p(df[name])\n", | |
"    return manual\n", | |
"%timeit go()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10 loops, best of 3: 50 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%timeit patsy.dmatrix(\"a + b + np.log1p(x) + np.log1p(y)\", df)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 296 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def go():\n", | |
"    \"\"\"Assemble by hand the columns a, b, log1p(x), log1p(y).\n", | |
"\n", | |
"    Reads the module-level frame df and returns a new frame on the same index.\n", | |
"    The boolean columns for c and z are intentionally left out here.\n", | |
"    \"\"\"\n", | |
"    manual = pd.DataFrame(index=df.index)\n", | |
"    # Plain copies of the untransformed columns.\n", | |
"    for name in (\"a\", \"b\"):\n", | |
"        manual[name] = df[name]\n", | |
"    # log1p applied to the whole column at once.\n", | |
"    for name in (\"x\", \"y\"):\n", | |
"        manual[name] = np.log1p(df[name])\n", | |
"    return manual\n", | |
"%timeit go()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10 loops, best of 3: 42.5 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%timeit patsy.dmatrix(\"a + b\", df)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10 loops, best of 3: 113 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def go():\n", | |
"    \"\"\"Copy only columns a and b of df into a new frame on df's index.\n", | |
"\n", | |
"    The c, z, x and y columns are intentionally left out here.\n", | |
"    \"\"\"\n", | |
"    manual = pd.DataFrame(index=df.index)\n", | |
"    for name in (\"a\", \"b\"):\n", | |
"        manual[name] = df[name]\n", | |
"    return manual\n", | |
"%timeit go()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"100 loops, best of 3: 5.68 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment