Skip to content

Instantly share code, notes, and snippets.

@fscottfoti
Last active August 29, 2015 14:01
Show Gist options
  • Save fscottfoti/1cec32e70a60e440ebce to your computer and use it in GitHub Desktop.
Save fscottfoti/1cec32e70a60e440ebce to your computer and use it in GitHub Desktop.
Is patsy slow or am I doing something wrong?
{
"metadata": {
"name": "",
"signature": "sha256:0cf6cb9492bf54a87341cd63b1eaa77ad1a76608073d08f68c5c2d41d7568686"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def rand(): return np.random.rand(1*1000000)\n",
"df = pd.DataFrame({'a': rand(), 'b': rand(), 'c': rand(), 'x': rand(), 'y': rand(), 'z': rand()})"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td> 1000000.000000</td>\n",
" <td> 1000000.000000</td>\n",
" <td> 1000000.000000</td>\n",
" <td> 1000000.000000</td>\n",
" <td> 1000000.000000</td>\n",
" <td> 1000000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td> 0.499703</td>\n",
" <td> 0.500001</td>\n",
" <td> 0.500315</td>\n",
" <td> 0.500131</td>\n",
" <td> 0.500158</td>\n",
" <td> 0.499718</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td> 0.288530</td>\n",
" <td> 0.288911</td>\n",
" <td> 0.288757</td>\n",
" <td> 0.288680</td>\n",
" <td> 0.288796</td>\n",
" <td> 0.288611</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td> 0.000001</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000001</td>\n",
" <td> 0.000001</td>\n",
" <td> 0.000001</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td> 0.249909</td>\n",
" <td> 0.249649</td>\n",
" <td> 0.250089</td>\n",
" <td> 0.249889</td>\n",
" <td> 0.250296</td>\n",
" <td> 0.249225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td> 0.499495</td>\n",
" <td> 0.499889</td>\n",
" <td> 0.500764</td>\n",
" <td> 0.499820</td>\n",
" <td> 0.500345</td>\n",
" <td> 0.499891</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td> 0.749365</td>\n",
" <td> 0.750536</td>\n",
" <td> 0.750163</td>\n",
" <td> 0.750429</td>\n",
" <td> 0.750276</td>\n",
" <td> 0.749692</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 1.000000</td>\n",
" <td> 0.999999</td>\n",
" <td> 0.999999</td>\n",
" <td> 0.999998</td>\n",
" <td> 0.999999</td>\n",
" <td> 1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows \u00d7 6 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
" a b c x \\\n",
"count 1000000.000000 1000000.000000 1000000.000000 1000000.000000 \n",
"mean 0.499703 0.500001 0.500315 0.500131 \n",
"std 0.288530 0.288911 0.288757 0.288680 \n",
"min 0.000001 0.000000 0.000001 0.000001 \n",
"25% 0.249909 0.249649 0.250089 0.249889 \n",
"50% 0.499495 0.499889 0.500764 0.499820 \n",
"75% 0.749365 0.750536 0.750163 0.750429 \n",
"max 1.000000 0.999999 0.999999 0.999998 \n",
"\n",
" y z \n",
"count 1000000.000000 1000000.000000 \n",
"mean 0.500158 0.499718 \n",
"std 0.288796 0.288611 \n",
"min 0.000001 0.000000 \n",
"25% 0.250296 0.249225 \n",
"50% 0.500345 0.499891 \n",
"75% 0.750276 0.749692 \n",
"max 0.999999 1.000000 \n",
"\n",
"[8 rows x 6 columns]"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import patsy\n",
"%timeit patsy.dmatrix(\"a + b + I(c<.3) + I(z>.7) + np.log1p(x) + np.log1p(y)\", df)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 loops, best of 3: 11.5 s per loop\n"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def go():\n",
" df2 = pd.DataFrame(index=df.index)\n",
" df2[\"a\"] = df[\"a\"]\n",
" df2[\"b\"] = df[\"b\"]\n",
" df2[\"c\"] = df[\"c\"] < .3\n",
" df2[\"z\"] = df[\"z\"] > .7\n",
" df2[\"x\"] = df[\"x\"].apply(np.log1p)\n",
" df2[\"y\"] = df[\"y\"].apply(np.log1p)\n",
" return df2\n",
"%timeit go()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"10 loops, best of 3: 50 ms per loop\n"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%timeit patsy.dmatrix(\"a + b + np.log1p(x) + np.log1p(y)\", df)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 loops, best of 3: 296 ms per loop\n"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def go():\n",
" df2 = pd.DataFrame(index=df.index)\n",
" df2[\"a\"] = df[\"a\"]\n",
" df2[\"b\"] = df[\"b\"]\n",
" #df2[\"c\"] = df[\"c\"] < .3\n",
" #df2[\"z\"] = df[\"z\"] > .7\n",
" df2[\"x\"] = df[\"x\"].apply(np.log1p)\n",
" df2[\"y\"] = df[\"y\"].apply(np.log1p)\n",
" return df2\n",
"%timeit go()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"10 loops, best of 3: 42.5 ms per loop\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%timeit patsy.dmatrix(\"a + b\", df)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"10 loops, best of 3: 113 ms per loop\n"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def go():\n",
" df2 = pd.DataFrame(index=df.index)\n",
" df2[\"a\"] = df[\"a\"]\n",
" df2[\"b\"] = df[\"b\"]\n",
" #df2[\"c\"] = df[\"c\"] < .3\n",
" #df2[\"z\"] = df[\"z\"] > .7\n",
" #df2[\"x\"] = df[\"x\"].apply(np.log1p)\n",
" #df2[\"y\"] = df[\"y\"].apply(np.log1p)\n",
" return df2\n",
"%timeit go()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"100 loops, best of 3: 5.68 ms per loop\n"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment