Skip to content

Instantly share code, notes, and snippets.

@kspeeckaert
Created June 30, 2016 08:29
Show Gist options
  • Save kspeeckaert/957c966a5332fc6bc544617c2efad4e9 to your computer and use it in GitHub Desktop.
Save kspeeckaert/957c966a5332fc6bc544617c2efad4e9 to your computer and use it in GitHub Desktop.
Efficient query pandas dataset
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import random\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Construct the sample data (1.000.000 rows)\n",
"data = []\n",
"for i in range(1000000):\n",
" row = []\n",
" row.append(random.choice(['A1', 'A2', 'A3', 'A4', 'A5']))\n",
" row.append(random.choice(['B1', 'B2', 'B3', 'B4', 'B5']))\n",
" row.append(random.choice(['C1', 'C2', 'C3', 'C4', 'C5']))\n",
" row.append(random.choice(['D1', 'D2', 'D3', 'D4', 'D5']))\n",
" row.append(random.randint(1000, 100000))\n",
" data.append(row)\n",
"df = pd.DataFrame(data, columns=['A','B','C','D', 'val']) "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>val</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>812419</th>\n",
" <td>A1</td>\n",
" <td>B1</td>\n",
" <td>C2</td>\n",
" <td>D4</td>\n",
" <td>31882</td>\n",
" </tr>\n",
" <tr>\n",
" <th>964167</th>\n",
" <td>A5</td>\n",
" <td>B3</td>\n",
" <td>C2</td>\n",
" <td>D2</td>\n",
" <td>65951</td>\n",
" </tr>\n",
" <tr>\n",
" <th>505576</th>\n",
" <td>A1</td>\n",
" <td>B2</td>\n",
" <td>C4</td>\n",
" <td>D3</td>\n",
" <td>29178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>729472</th>\n",
" <td>A4</td>\n",
" <td>B5</td>\n",
" <td>C2</td>\n",
" <td>D4</td>\n",
" <td>46592</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43541</th>\n",
" <td>A4</td>\n",
" <td>B3</td>\n",
" <td>C3</td>\n",
" <td>D2</td>\n",
" <td>92247</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D val\n",
"812419 A1 B1 C2 D4 31882\n",
"964167 A5 B3 C2 D2 65951\n",
"505576 A1 B2 C4 D3 29178\n",
"729472 A4 B5 C2 D4 46592\n",
"43541 A4 B3 C3 D2 92247"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sample output\n",
"df.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1000000, 5)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1000000 entries, 0 to 999999\n",
"Data columns (total 5 columns):\n",
"A 1000000 non-null object\n",
"B 1000000 non-null object\n",
"C 1000000 non-null object\n",
"D 1000000 non-null object\n",
"val 1000000 non-null int64\n",
"dtypes: int64(1), object(4)\n",
"memory usage: 232.7 MB\n"
]
}
],
"source": [
"# Memory usage with deep introspection\n",
"df.info(memory_usage='deep')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 57 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"df[(df['A']== 'A1')]['val'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 80.9 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"df[(df['A']== 'A2') & (df['B'] == 'BA2')]['val'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 119 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"df[(df['A']== 'A4') & (df['C'] == 'CA4') & (df['D'] == 'D4')]['val'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for col in df.columns.tolist()[:-1]:\n",
" df[col] = df[col].astype('category')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1000000 entries, 0 to 999999\n",
"Data columns (total 5 columns):\n",
"A 1000000 non-null category\n",
"B 1000000 non-null category\n",
"C 1000000 non-null category\n",
"D 1000000 non-null category\n",
"val 1000000 non-null int64\n",
"dtypes: category(4), int64(1)\n",
"memory usage: 11.4 MB\n"
]
}
],
"source": [
"df.info(memory_usage='deep')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 9.36 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"df[(df['A']== 'A1')]['val'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 6.79 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"df[(df['A']== 'A2') & (df['B'] == 'BA2')]['val'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 9.37 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"df[(df['A']== 'A4') & (df['C'] == 'CA4') & (df['D'] == 'D4')]['val'].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@kspeeckaert
Copy link
Author

This is a sample notebook to demonstrate the improvement, both in resource usage and row selection, when converting columns to categories. The gist was posted in response to this question on StackOverflow.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment