Created
September 2, 2015 21:49
-
-
Save mcs07/e3269201884844f5871e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Set up a simple DataFrame with some dummy data:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame({\n", | |
" 'apKa': [9.82, 6.34, 10.38, 10.39, 5.12, 13.25],\n", | |
" 'bpKa': [-8.43, -6.55, -5.18, 8.45, 9.21, 1.79]\n", | |
"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>apKa</th>\n", | |
" <th>bpKa</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 9.82</td>\n", | |
" <td>-8.43</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 6.34</td>\n", | |
" <td>-6.55</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 10.38</td>\n", | |
" <td>-5.18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 10.39</td>\n", | |
" <td> 8.45</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 5.12</td>\n", | |
" <td> 9.21</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 13.25</td>\n", | |
" <td> 1.79</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" apKa bpKa\n", | |
"0 9.82 -8.43\n", | |
"1 6.34 -6.55\n", | |
"2 10.38 -5.18\n", | |
"3 10.39 8.45\n", | |
"4 5.12 9.21\n", | |
"5 13.25 1.79" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"It's very easy to generate columns that are simple mathematical operations or thresholds:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df['Base'] = df['bpKa'] > 8\n", | |
"df['Acid'] = df['apKa'] < 6.5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>apKa</th>\n", | |
" <th>bpKa</th>\n", | |
" <th>Base</th>\n", | |
" <th>Acid</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 9.82</td>\n", | |
" <td>-8.43</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 6.34</td>\n", | |
" <td>-6.55</td>\n", | |
" <td> False</td>\n", | |
" <td> True</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 10.38</td>\n", | |
" <td>-5.18</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 10.39</td>\n", | |
" <td> 8.45</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 5.12</td>\n", | |
" <td> 9.21</td>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 13.25</td>\n", | |
" <td> 1.79</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" apKa bpKa Base Acid\n", | |
"0 9.82 -8.43 False False\n", | |
"1 6.34 -6.55 False True\n", | |
"2 10.38 -5.18 False False\n", | |
"3 10.39 8.45 True False\n", | |
"4 5.12 9.21 True True\n", | |
"5 13.25 1.79 False False" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For more complex situations, we can use the `apply` method with `axis=1`. This allows us to generate a value using an arbitrary function on the existing row:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>apKa</th>\n", | |
" <th>bpKa</th>\n", | |
" <th>Base</th>\n", | |
" <th>Acid</th>\n", | |
" <th>ABZN</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 9.82</td>\n", | |
" <td>-8.43</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> Neutral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 6.34</td>\n", | |
" <td>-6.55</td>\n", | |
" <td> False</td>\n", | |
" <td> True</td>\n", | |
" <td> Acid</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 10.38</td>\n", | |
" <td>-5.18</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> Neutral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 10.39</td>\n", | |
" <td> 8.45</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> Base</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 5.12</td>\n", | |
" <td> 9.21</td>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> Zwitterion</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 13.25</td>\n", | |
" <td> 1.79</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> Neutral</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" apKa bpKa Base Acid ABZN\n", | |
"0 9.82 -8.43 False False Neutral\n", | |
"1 6.34 -6.55 False True Acid\n", | |
"2 10.38 -5.18 False False Neutral\n", | |
"3 10.39 8.45 True False Base\n", | |
"4 5.12 9.21 True True Zwitterion\n", | |
"5 13.25 1.79 False False Neutral" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def calculate_ABZN(row):\n", | |
"    \"\"\"Label one row as 'Zwitterion', 'Acid', 'Base' or 'Neutral'.\n", | |
"\n", | |
"    `row` is indexed by the keys 'Acid' and 'Base'; the truthiness of those\n", | |
"    two values selects the label.\n", | |
"    \"\"\"\n", | |
"    acidic, basic = row['Acid'], row['Base']\n", | |
"    if acidic:\n", | |
"        # Both flags set -> zwitterion; acid flag alone -> acid.\n", | |
"        return 'Zwitterion' if basic else 'Acid'\n", | |
"    # Acid flag not set: the base flag alone decides between base and neutral.\n", | |
"    return 'Base' if basic else 'Neutral'\n", | |
" \n", | |
"df['ABZN'] = df.apply(calculate_ABZN, axis=1)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"If you don't actually need the `Acid` and `Base` columns, you could calculate `ABZN` straight from the `apKa` and `bpKa` columns in a similar way." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Alternatively, we could use Python's built-in `map` function:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>apKa</th>\n", | |
" <th>bpKa</th>\n", | |
" <th>Base</th>\n", | |
" <th>Acid</th>\n", | |
" <th>ABZN</th>\n", | |
" <th>ABZN2</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 9.82</td>\n", | |
" <td>-8.43</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> Neutral</td>\n", | |
" <td> Neutral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 6.34</td>\n", | |
" <td>-6.55</td>\n", | |
" <td> False</td>\n", | |
" <td> True</td>\n", | |
" <td> Acid</td>\n", | |
" <td> Acid</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 10.38</td>\n", | |
" <td>-5.18</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> Neutral</td>\n", | |
" <td> Neutral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 10.39</td>\n", | |
" <td> 8.45</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> Base</td>\n", | |
" <td> Base</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 5.12</td>\n", | |
" <td> 9.21</td>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> Zwitterion</td>\n", | |
" <td> Zwitterion</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 13.25</td>\n", | |
" <td> 1.79</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> Neutral</td>\n", | |
" <td> Neutral</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" apKa bpKa Base Acid ABZN ABZN2\n", | |
"0 9.82 -8.43 False False Neutral Neutral\n", | |
"1 6.34 -6.55 False True Acid Acid\n", | |
"2 10.38 -5.18 False False Neutral Neutral\n", | |
"3 10.39 8.45 True False Base Base\n", | |
"4 5.12 9.21 True True Zwitterion Zwitterion\n", | |
"5 13.25 1.79 False False Neutral Neutral" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Label for every (Acid, Base) flag combination.\n", | |
"lookup = {\n", | |
"    (True, True): 'Zwitterion',\n", | |
"    (True, False): 'Acid',\n", | |
"    (False, True): 'Base',\n", | |
"    (False, False): 'Neutral',\n", | |
"}\n", | |
"\n", | |
"def calculate_ABZN2(acid, base):\n", | |
"    \"\"\"Return the label stored in `lookup` for one (acid, base) flag pair.\n", | |
"\n", | |
"    Raises KeyError if the pair is not one of the four boolean combinations.\n", | |
"    \"\"\"\n", | |
"    return lookup[(acid, base)]\n", | |
"\n", | |
"# list(...) is required on Python 3, where the built-in map returns a lazy\n", | |
"# iterator instead of a list; on Python 2 it is a harmless copy.\n", | |
"df['ABZN2'] = list(map(calculate_ABZN2, df['Acid'], df['Base']))\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The `ABZN` column contains strings and therefore has the `object` dtype:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"apKa float64\n", | |
"bpKa float64\n", | |
"Base bool\n", | |
"Acid bool\n", | |
"ABZN object\n", | |
"ABZN2 object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"It may be useful to use the pandas categorical functionality: <http://pandas.pydata.org/pandas-docs/stable/categorical.html>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df['ABZN'] = df['ABZN'].astype('category')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"apKa float64\n", | |
"bpKa float64\n", | |
"Base bool\n", | |
"Acid bool\n", | |
"ABZN category\n", | |
"ABZN2 object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index([u'Acid', u'Base', u'Neutral', u'Zwitterion'], dtype='object')" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['ABZN'].cat.categories" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment