Skip to content

Instantly share code, notes, and snippets.

@pllim
Last active October 28, 2016 20:34
Show Gist options
  • Save pllim/70665a2bdc1c5a353694e7f96c166138 to your computer and use it in GitHub Desktop.
Save pllim/70665a2bdc1c5a353694e7f96c166138 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Homework to auto optimize `pandas` data frame object."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Import modules needed for this notebook\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def df_optimize_basic(df, thresh=0.1):\n",
" \"\"\"Optimize dataframe by using categorial columns.\"\"\"\n",
" c_info = df.select_dtypes(include=['object']).describe()\n",
" ratio = c_info.loc['unique'] / c_info.loc['count']\n",
" for iname in ratio[ratio < thresh].index:\n",
" df[iname] = df[iname].astype('category')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Answer from instructor, David Mertz, *\"Here's a bit more aggressive optimize() function. It gets the beer DataFrame down to less than half its original size. I'm pretty sure a wide range of queries will be considerably faster also.\"*"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def df_optimize(df, thresh=0.25):\n",
" \"\"\"Optimize dataframe by using categorial columns.\"\"\"\n",
" c_info = df.select_dtypes(include=['object']).describe()\n",
" ratio = c_info.loc['unique'] / c_info.loc['count']\n",
" for iname in ratio[ratio < thresh].index:\n",
" df[iname] = df[iname].astype('category')\n",
" \n",
" for col in df.columns:\n",
" if df[col].dtype == np.int64:\n",
" if df[col].min() > -2**15 and df[col].max() < 2**15:\n",
" df[col] = df[col].astype('int16')\n",
" elif df[col].min() > -2**31 and df[col].max() < 2**31:\n",
" df[col] = df[col].astype('int32')\n",
" if df[col].dtype in (np.float64, np.float128):\n",
" vals = np.unique(df[col])\n",
" nvals = vals[np.isfinite(vals)].size\n",
" if nvals/len(df) < thresh:\n",
" df[col] = df[col].astype('category')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Modified answer to be even more aggresive."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def df_optimize_overkill(df, thresh=0.25):\n",
" \"\"\"Optimize dataframe by using categorial columns and\n",
" demoting int data types.\n",
"\n",
" Parameters\n",
" ----------\n",
" df : DataFrame\n",
" ``pandas`` data frame. This is modified in-place.\n",
"\n",
" thresh : float, optional\n",
" Threshold of unique to total ratio to\n",
" convert column to a category.\n",
"\n",
" \"\"\"\n",
" # Convert repeating strings to category.\n",
" c_info = df.select_dtypes(include=['object']).describe()\n",
" ratio = c_info.loc['unique'] / c_info.loc['count']\n",
" for iname in ratio[ratio < thresh].index:\n",
" df[iname] = df[iname].astype('category')\n",
"\n",
" ii08 = np.iinfo(np.int8)\n",
" ii16 = np.iinfo(np.int16) \n",
" ii32 = np.iinfo(np.int32)\n",
"\n",
" fi16 = np.finfo(np.float16)\n",
" fi32 = np.finfo(np.float32)\n",
" fi64 = np.finfo(np.float64)\n",
" \n",
" for col in df.columns: \n",
" # Convert repeating numbers to category.\n",
" if df[col].dtype in (np.int8, np.int16, np.int32, np.int64,\n",
" np.float16, np.float32, np.float64, np.float128):\n",
" vals = np.unique(df[col])\n",
" nvals = np.count_nonzero(np.isfinite(vals))\n",
" if nvals / len(df) < thresh:\n",
" df[col] = df[col].astype('category')\n",
" \n",
" # Demote numerical data types if still not a category.\n",
" # Demotion is level by level until lowest possible level is reached.\n",
" if df[col].dtype in (np.int16, np.int32, np.int64):\n",
" if df[col].dtype == np.int64:\n",
" if df[col].min() > ii32.min and df[col].max() < ii32.max:\n",
" df[col] = df[col].astype('int32') \n",
" if df[col].dtype == np.int32:\n",
" if df[col].min() > ii16.min and df[col].max() < ii16.max:\n",
" df[col] = df[col].astype('int16')\n",
" else: # int16\n",
" if df[col].min() > ii08.min and df[col].max() < ii08.max:\n",
" df[col] = df[col].astype('int8')\n",
" # D. Mertz: Demoting float is a bad idea!\n",
" elif df[col].dtype in (np.float32, np.float64, np.float128):\n",
" if df[col].dtype == np.float128:\n",
" if df[col].min() > fi64.min and df[col].max() < fi64.max:\n",
" df[col] = df[col].astype('float64')\n",
" if df[col].dtype == np.float64:\n",
" if df[col].min() > fi32.min and df[col].max() < fi32.max:\n",
" df[col] = df[col].astype('float32')\n",
" else: # float32\n",
" if df[col].min() > fi16.min and df[col].max() < fi16.max:\n",
" df[col] = df[col].astype('float16') "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 50000 entries, 0 to 49999\n",
"Data columns (total 13 columns):\n",
"abv 48389 non-null float64\n",
"beer_id 50000 non-null int64\n",
"brewer_id 50000 non-null int64\n",
"beer_name 50000 non-null object\n",
"beer_style 50000 non-null object\n",
"review_appearance 50000 non-null float64\n",
"review_aroma 50000 non-null float64\n",
"review_overall 50000 non-null float64\n",
"review_palate 50000 non-null float64\n",
"profile_name 50000 non-null object\n",
"review_taste 50000 non-null float64\n",
"text 49991 non-null object\n",
"time 50000 non-null datetime64[ns]\n",
"dtypes: datetime64[ns](1), float64(6), int64(2), object(4)\n",
"memory usage: 5.3+ MB\n"
]
}
],
"source": [
"df = pd.read_csv('data/beer2.csv.gz',\n",
" index_col=0,\n",
" parse_dates=['time'])\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 50000 entries, 0 to 49999\n",
"Data columns (total 13 columns):\n",
"abv 48389 non-null float64\n",
"beer_id 50000 non-null int64\n",
"brewer_id 50000 non-null int64\n",
"beer_name 50000 non-null object\n",
"beer_style 50000 non-null category\n",
"review_appearance 50000 non-null float64\n",
"review_aroma 50000 non-null float64\n",
"review_overall 50000 non-null float64\n",
"review_palate 50000 non-null float64\n",
"profile_name 50000 non-null category\n",
"review_taste 50000 non-null float64\n",
"text 49991 non-null object\n",
"time 50000 non-null datetime64[ns]\n",
"dtypes: category(2), datetime64[ns](1), float64(6), int64(2), object(2)\n",
"memory usage: 4.8+ MB\n"
]
}
],
"source": [
"# Original homework answer.\n",
"df2 = df.copy()\n",
"df_optimize_basic(df2)\n",
"df2.info()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 50000 entries, 0 to 49999\n",
"Data columns (total 13 columns):\n",
"abv 48389 non-null category\n",
"beer_id 50000 non-null int32\n",
"brewer_id 50000 non-null int16\n",
"beer_name 50000 non-null category\n",
"beer_style 50000 non-null category\n",
"review_appearance 50000 non-null category\n",
"review_aroma 50000 non-null category\n",
"review_overall 50000 non-null category\n",
"review_palate 50000 non-null category\n",
"profile_name 50000 non-null category\n",
"review_taste 50000 non-null category\n",
"text 49991 non-null object\n",
"time 50000 non-null datetime64[ns]\n",
"dtypes: category(9), datetime64[ns](1), int16(1), int32(1), object(1)\n",
"memory usage: 2.1+ MB\n"
]
}
],
"source": [
"# D. Mertz's solution.\n",
"df2 = df.copy()\n",
"df_optimize(df2)\n",
"df2.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 50000 entries, 0 to 49999\n",
"Data columns (total 13 columns):\n",
"abv 48389 non-null category\n",
"beer_id 50000 non-null category\n",
"brewer_id 50000 non-null category\n",
"beer_name 50000 non-null category\n",
"beer_style 50000 non-null category\n",
"review_appearance 50000 non-null category\n",
"review_aroma 50000 non-null category\n",
"review_overall 50000 non-null category\n",
"review_palate 50000 non-null category\n",
"profile_name 50000 non-null category\n",
"review_taste 50000 non-null category\n",
"text 49991 non-null object\n",
"time 50000 non-null datetime64[ns]\n",
"dtypes: category(11), datetime64[ns](1), object(1)\n",
"memory usage: 2.1+ MB\n"
]
}
],
"source": [
"# Overkill solution.\n",
"# No memory usage improvement for this test case.\n",
"df2 = df.copy()\n",
"df_optimize_overkill(df2)\n",
"df2.info()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment