Created
July 29, 2015 06:50
-
-
Save aflaxman/f40ecd3c3eb33f5ab8d2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:b5b921536b689777efd9a4206a2dcc835ee97838cc2794927e83a37efba0ea29" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!date\n", | |
"import numpy as np, pandas as pd, pymc3 as pm, matplotlib.pyplot as plt, seaborn as sns\n", | |
"%matplotlib inline\n", | |
"sns.set_context('paper')\n", | |
"sns.set_style('darkgrid')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Tue Jul 28 23:47:38 PDT 2015\r\n" | |
] | |
} | |
], | |
"prompt_number": 55 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Pandas DataFrame got some cool new methods in version 0.13\n", | |
"\n", | |
"I'm a little late to this party, but I'm going to have some fun\n", | |
"\n", | |
" df.query\n", | |
" df.eval" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pd.__version__" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 56, | |
"text": [ | |
"'0.16.2'" | |
] | |
} | |
], | |
"prompt_number": 56 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Rows of synthetic data. This must be an int: a float such as 1e8 is not\n", | |
"# accepted as an array dimension by current NumPy (older releases only warned).\n", | |
"# At 3 float64 columns this frame needs roughly 2.4 GB of memory.\n", | |
"n = 10**8\n", | |
"\n", | |
"np.random.seed(12345) # set seed for reproducibility\n", | |
"\n", | |
"# Three columns of Uniform(0, 1) draws named x, y, z.\n", | |
"df = pd.DataFrame(np.random.uniform(size=(n, 3)), columns=['x', 'y', 'z'])\n", | |
"df.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>x</th>\n", | |
" <th>y</th>\n", | |
" <th>z</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0.929616</td>\n", | |
" <td>0.316376</td>\n", | |
" <td>0.183919</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.204560</td>\n", | |
" <td>0.567725</td>\n", | |
" <td>0.595545</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0.964515</td>\n", | |
" <td>0.653177</td>\n", | |
" <td>0.748907</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0.653570</td>\n", | |
" <td>0.747715</td>\n", | |
" <td>0.961307</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0.008388</td>\n", | |
" <td>0.106444</td>\n", | |
" <td>0.298704</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 57, | |
"text": [ | |
" x y z\n", | |
"0 0.929616 0.316376 0.183919\n", | |
"1 0.204560 0.567725 0.595545\n", | |
"2 0.964515 0.653177 0.748907\n", | |
"3 0.653570 0.747715 0.961307\n", | |
"4 0.008388 0.106444 0.298704" | |
] | |
} | |
], | |
"prompt_number": 57 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For large dataframes, `eval` and `query` make things somewhat faster:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%timeit df.x + df.y**2" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 407 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 58 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%timeit df.eval('x + y**2')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10 loops, best of 3: 100 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 59 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%timeit df[df.x + df.y**2 < 1.]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 4.69 s per loop\n" | |
] | |
} | |
], | |
"prompt_number": 60 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%timeit df.query('x + y**2 < 1.')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 2.86 s per loop\n" | |
] | |
} | |
], | |
"prompt_number": 61 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"In my opinion, they make things clearer, too:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# From here on `df` is the Stata dataset read below, no longer the random\n", | |
"# x/y/z frame used for the timing cells above.\n", | |
"df = pd.read_stata(\n", | |
"    '/homes/abie/irq_inj/Iraq_Injury_Aug_14_CLEANED.dta'\n", | |
")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 62 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Boolean row filter written in `DataFrame.eval` syntax: the row matches one of\n", | |
"# the three fall codings AND at least one of the listed death /\n", | |
"# hospitalization / surgeryrequired values.\n", | |
"serious_fall_expr = \"\"\"\n", | |
"(type == 'Fall' or Typeothers == 'fall' or Typeothers == 2)\n", | |
"and (\n", | |
" death == 'Yes' or\n", | |
" hospitalization == 'First hospitalization needed for' or\n", | |
" hospitalization == 'Second hospitalization needed for' or \n", | |
" hospitalization == 'Third hospitalization needed for' or \n", | |
" surgeryrequired == 'During the procedure the patient was awake (local anesthetic)' or\n", | |
" surgeryrequired == 'During the procedure the patient was given a general anesthetic?'\n", | |
")\n", | |
"\"\"\"\n", | |
"\n", | |
"# Turn every newline into a space so the expression is a single line\n", | |
"# (presumably required by eval -- TODO confirm), then sum the boolean result\n", | |
"# to count the matching rows.\n", | |
"df.eval(' '.join(serious_fall_expr.split('\\n'))).sum()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 64, | |
"text": [ | |
"95" | |
] | |
} | |
], | |
"prompt_number": 64 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Cool that it gets displayed as a notebook in a Gist, too, and with comments.