Last active
October 28, 2016 20:34
-
-
Save pllim/70665a2bdc1c5a353694e7f96c166138 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Homework to auto optimize `pandas` data frame object." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Import modules needed for this notebook\n", | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def df_optimize_basic(df, thresh=0.1):\n", | |
" \"\"\"Optimize dataframe by using categorial columns.\"\"\"\n", | |
" c_info = df.select_dtypes(include=['object']).describe()\n", | |
" ratio = c_info.loc['unique'] / c_info.loc['count']\n", | |
" for iname in ratio[ratio < thresh].index:\n", | |
" df[iname] = df[iname].astype('category')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Answer from instructor, David Mertz, *\"Here's a bit more aggressive optimize() function. It gets the beer DataFrame down to less than half its original size. I'm pretty sure a wide range of queries will be considerably faster also.\"*" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def df_optimize(df, thresh=0.25):\n", | |
" \"\"\"Optimize dataframe by using categorial columns.\"\"\"\n", | |
" c_info = df.select_dtypes(include=['object']).describe()\n", | |
" ratio = c_info.loc['unique'] / c_info.loc['count']\n", | |
" for iname in ratio[ratio < thresh].index:\n", | |
" df[iname] = df[iname].astype('category')\n", | |
" \n", | |
" for col in df.columns:\n", | |
" if df[col].dtype == np.int64:\n", | |
" if df[col].min() > -2**15 and df[col].max() < 2**15:\n", | |
" df[col] = df[col].astype('int16')\n", | |
" elif df[col].min() > -2**31 and df[col].max() < 2**31:\n", | |
" df[col] = df[col].astype('int32')\n", | |
" if df[col].dtype in (np.float64, np.float128):\n", | |
" vals = np.unique(df[col])\n", | |
" nvals = vals[np.isfinite(vals)].size\n", | |
" if nvals/len(df) < thresh:\n", | |
" df[col] = df[col].astype('category')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Modified answer to be even more aggresive." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def df_optimize_overkill(df, thresh=0.25):\n", | |
" \"\"\"Optimize dataframe by using categorial columns and\n", | |
" demoting int data types.\n", | |
"\n", | |
" Parameters\n", | |
" ----------\n", | |
" df : DataFrame\n", | |
" ``pandas`` data frame. This is modified in-place.\n", | |
"\n", | |
" thresh : float, optional\n", | |
" Threshold of unique to total ratio to\n", | |
" convert column to a category.\n", | |
"\n", | |
" \"\"\"\n", | |
" # Convert repeating strings to category.\n", | |
" c_info = df.select_dtypes(include=['object']).describe()\n", | |
" ratio = c_info.loc['unique'] / c_info.loc['count']\n", | |
" for iname in ratio[ratio < thresh].index:\n", | |
" df[iname] = df[iname].astype('category')\n", | |
"\n", | |
" ii08 = np.iinfo(np.int8)\n", | |
" ii16 = np.iinfo(np.int16) \n", | |
" ii32 = np.iinfo(np.int32)\n", | |
"\n", | |
" fi16 = np.finfo(np.float16)\n", | |
" fi32 = np.finfo(np.float32)\n", | |
" fi64 = np.finfo(np.float64)\n", | |
" \n", | |
" for col in df.columns: \n", | |
" # Convert repeating numbers to category.\n", | |
" if df[col].dtype in (np.int8, np.int16, np.int32, np.int64,\n", | |
" np.float16, np.float32, np.float64, np.float128):\n", | |
" vals = np.unique(df[col])\n", | |
" nvals = np.count_nonzero(np.isfinite(vals))\n", | |
" if nvals / len(df) < thresh:\n", | |
" df[col] = df[col].astype('category')\n", | |
" \n", | |
" # Demote numerical data types if still not a category.\n", | |
" # Demotion is level by level until lowest possible level is reached.\n", | |
" if df[col].dtype in (np.int16, np.int32, np.int64):\n", | |
" if df[col].dtype == np.int64:\n", | |
" if df[col].min() > ii32.min and df[col].max() < ii32.max:\n", | |
" df[col] = df[col].astype('int32') \n", | |
" if df[col].dtype == np.int32:\n", | |
" if df[col].min() > ii16.min and df[col].max() < ii16.max:\n", | |
" df[col] = df[col].astype('int16')\n", | |
" else: # int16\n", | |
" if df[col].min() > ii08.min and df[col].max() < ii08.max:\n", | |
" df[col] = df[col].astype('int8')\n", | |
" # D. Mertz: Demoting float is a bad idea!\n", | |
" elif df[col].dtype in (np.float32, np.float64, np.float128):\n", | |
" if df[col].dtype == np.float128:\n", | |
" if df[col].min() > fi64.min and df[col].max() < fi64.max:\n", | |
" df[col] = df[col].astype('float64')\n", | |
" if df[col].dtype == np.float64:\n", | |
" if df[col].min() > fi32.min and df[col].max() < fi32.max:\n", | |
" df[col] = df[col].astype('float32')\n", | |
" else: # float32\n", | |
" if df[col].min() > fi16.min and df[col].max() < fi16.max:\n", | |
" df[col] = df[col].astype('float16') " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 50000 entries, 0 to 49999\n", | |
"Data columns (total 13 columns):\n", | |
"abv 48389 non-null float64\n", | |
"beer_id 50000 non-null int64\n", | |
"brewer_id 50000 non-null int64\n", | |
"beer_name 50000 non-null object\n", | |
"beer_style 50000 non-null object\n", | |
"review_appearance 50000 non-null float64\n", | |
"review_aroma 50000 non-null float64\n", | |
"review_overall 50000 non-null float64\n", | |
"review_palate 50000 non-null float64\n", | |
"profile_name 50000 non-null object\n", | |
"review_taste 50000 non-null float64\n", | |
"text 49991 non-null object\n", | |
"time 50000 non-null datetime64[ns]\n", | |
"dtypes: datetime64[ns](1), float64(6), int64(2), object(4)\n", | |
"memory usage: 5.3+ MB\n" | |
] | |
} | |
], | |
"source": [ | |
"df = pd.read_csv('data/beer2.csv.gz',\n", | |
" index_col=0,\n", | |
" parse_dates=['time'])\n", | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 50000 entries, 0 to 49999\n", | |
"Data columns (total 13 columns):\n", | |
"abv 48389 non-null float64\n", | |
"beer_id 50000 non-null int64\n", | |
"brewer_id 50000 non-null int64\n", | |
"beer_name 50000 non-null object\n", | |
"beer_style 50000 non-null category\n", | |
"review_appearance 50000 non-null float64\n", | |
"review_aroma 50000 non-null float64\n", | |
"review_overall 50000 non-null float64\n", | |
"review_palate 50000 non-null float64\n", | |
"profile_name 50000 non-null category\n", | |
"review_taste 50000 non-null float64\n", | |
"text 49991 non-null object\n", | |
"time 50000 non-null datetime64[ns]\n", | |
"dtypes: category(2), datetime64[ns](1), float64(6), int64(2), object(2)\n", | |
"memory usage: 4.8+ MB\n" | |
] | |
} | |
], | |
"source": [ | |
"# Original homework answer.\n", | |
"df2 = df.copy()\n", | |
"df_optimize_basic(df2)\n", | |
"df2.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 50000 entries, 0 to 49999\n", | |
"Data columns (total 13 columns):\n", | |
"abv 48389 non-null category\n", | |
"beer_id 50000 non-null int32\n", | |
"brewer_id 50000 non-null int16\n", | |
"beer_name 50000 non-null category\n", | |
"beer_style 50000 non-null category\n", | |
"review_appearance 50000 non-null category\n", | |
"review_aroma 50000 non-null category\n", | |
"review_overall 50000 non-null category\n", | |
"review_palate 50000 non-null category\n", | |
"profile_name 50000 non-null category\n", | |
"review_taste 50000 non-null category\n", | |
"text 49991 non-null object\n", | |
"time 50000 non-null datetime64[ns]\n", | |
"dtypes: category(9), datetime64[ns](1), int16(1), int32(1), object(1)\n", | |
"memory usage: 2.1+ MB\n" | |
] | |
} | |
], | |
"source": [ | |
"# D. Mertz's solution.\n", | |
"df2 = df.copy()\n", | |
"df_optimize(df2)\n", | |
"df2.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 50000 entries, 0 to 49999\n", | |
"Data columns (total 13 columns):\n", | |
"abv 48389 non-null category\n", | |
"beer_id 50000 non-null category\n", | |
"brewer_id 50000 non-null category\n", | |
"beer_name 50000 non-null category\n", | |
"beer_style 50000 non-null category\n", | |
"review_appearance 50000 non-null category\n", | |
"review_aroma 50000 non-null category\n", | |
"review_overall 50000 non-null category\n", | |
"review_palate 50000 non-null category\n", | |
"profile_name 50000 non-null category\n", | |
"review_taste 50000 non-null category\n", | |
"text 49991 non-null object\n", | |
"time 50000 non-null datetime64[ns]\n", | |
"dtypes: category(11), datetime64[ns](1), object(1)\n", | |
"memory usage: 2.1+ MB\n" | |
] | |
} | |
], | |
"source": [ | |
"# Overkill solution.\n", | |
"# No memory usage improvement for this test case.\n", | |
"df2 = df.copy()\n", | |
"df_optimize_overkill(df2)\n", | |
"df2.info()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment