Skip to content

Instantly share code, notes, and snippets.

@catethos
Created July 28, 2014 16:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save catethos/ab979acf3f7c818828a1 to your computer and use it in GitHub Desktop.
Save catethos/ab979acf3f7c818828a1 to your computer and use it in GitHub Desktop.
dplyr in python
{
"metadata": {
"name": "",
"signature": "sha256:3dc5f3d60796427b79e67432cd5917d24ddffaed9465d883f5e86d48eef949b6"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"dplyr inspired data manipulation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[dplyr](https://github.com/hadley/dplyr) is a new R package that provides a consistent set of interfaces for efficiently manipulating data easily. For example, the following code add a _c_ column to the dataframe _df_ which equals to the sum of columns _a_ and _b_.\n",
"\n",
"<blockquote>\n",
"mutate(df, c=a+b)\n",
"</blockquote>\n",
"\n",
"I wish to implement a similar interface in Python. The following code is a naive implementation of the _mutate_ function. It is simply a proof of concept and I have perhap used all the wrong tools to do it, for example using regex for parsing and exec to run the code. Anyway, I hope that someone has better idea to do this."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import re\n",
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"function_regex = re.compile(r\"(?P<function>^[a-z]*)\\((?P<argument>[a-z]*)\\)$\")\n",
"operator_regex = re.compile(r\"(\\+|-|\\*|/)\")\n",
"column_regex = re.compile(r\"(\\w_?)+\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def m(token):\n",
" column = column_regex.match(token)\n",
" operator = operator_regex.match(token)\n",
" function = function_regex.match(token)\n",
" if function:\n",
" return \"df['{}'].apply({})\".format(function.group(\"argument\"),function.group(\"function\"))\n",
" elif operator:\n",
" return token\n",
" elif column:\n",
" return \"df['{}']\".format(token)\n",
" \n",
"def transform(f):\n",
" lhs,rhs = f.split(\"=\")\n",
" lhs = lhs.strip()\n",
" rhs = rhs.strip()\n",
" tokens = rhs.split(\" \")\n",
" return \"df['{}'] = \".format(lhs) + \"\".join([m(x) for x in tokens])\n",
"\n",
"def mutate(df,formula):\n",
" exec(transform(formula),\n",
" globals(),\n",
" {\"df\":df})\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Example:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dataframe = pd.DataFrame.from_dict({\"a\":[1,2,3],\"b\":[2,4,6]})"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dataframe"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows \u00d7 2 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
" a b\n",
"0 1 2\n",
"1 2 4\n",
"2 3 6\n",
"\n",
"[3 rows x 2 columns]"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mutate(dataframe,\n",
" \"c = a + b\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dataframe"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 6</td>\n",
" <td> 9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows \u00d7 3 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
" a b c\n",
"0 1 2 3\n",
"1 2 4 6\n",
"2 3 6 9\n",
"\n",
"[3 rows x 3 columns]"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def addone(x):\n",
" return x+1"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mutate(dataframe,\n",
" \"d = addone(b)\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dataframe"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 6</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 6</td>\n",
" <td> 9</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows \u00d7 4 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 19,
"text": [
" a b c d\n",
"0 1 2 3 3\n",
"1 2 4 6 5\n",
"2 3 6 9 7\n",
"\n",
"[3 rows x 4 columns]"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mutate(dataframe,\n",
" \"e = addone(b) + c\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dataframe"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" <td> 3</td>\n",
" <td> 6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 6</td>\n",
" <td> 5</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 6</td>\n",
" <td> 9</td>\n",
" <td> 7</td>\n",
" <td> 16</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows \u00d7 5 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 21,
"text": [
" a b c d e\n",
"0 1 2 3 3 6\n",
"1 2 4 6 5 11\n",
"2 3 6 9 7 16\n",
"\n",
"[3 rows x 5 columns]"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment