pllim/df_optimize.ipynb

## df_optimize.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Homework to auto optimize `pandas` data frame object."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Import modules needed for this notebook\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def df_optimize_basic(df, thresh=0.1):\n",
    "    \"\"\"Optimize dataframe by using categorial columns.\"\"\"\n",
    "    c_info = df.select_dtypes(include=['object']).describe()\n",
    "    ratio = c_info.loc['unique'] / c_info.loc['count']\n",
    "    for iname in ratio[ratio < thresh].index:\n",
    "        df[iname] = df[iname].astype('category')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Answer from instructor, David Mertz, *\"Here's a bit more aggressive optimize() function. It gets the beer DataFrame down to less than half its original size. I'm pretty sure a wide range of queries will be considerably faster also.\"*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def df_optimize(df, thresh=0.25):\n",
    "    \"\"\"Optimize dataframe by using categorial columns.\"\"\"\n",
    "    c_info = df.select_dtypes(include=['object']).describe()\n",
    "    ratio = c_info.loc['unique'] / c_info.loc['count']\n",
    "    for iname in ratio[ratio < thresh].index:\n",
    "        df[iname] = df[iname].astype('category')\n",
    "        \n",
    "    for col in df.columns:\n",
    "        if df[col].dtype == np.int64:\n",
    "            if df[col].min() > -2**15 and df[col].max() < 2**15:\n",
    "                df[col] = df[col].astype('int16')\n",
    "            elif df[col].min() >  -2**31 and df[col].max() < 2**31:\n",
    "                df[col] = df[col].astype('int32')\n",
    "        if df[col].dtype in (np.float64, np.float128):\n",
    "            vals = np.unique(df[col])\n",
    "            nvals = vals[np.isfinite(vals)].size\n",
    "            if nvals/len(df) < thresh:\n",
    "                df[col] = df[col].astype('category')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Modified answer to be even more aggresive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def df_optimize_overkill(df, thresh=0.25):\n",
    "    \"\"\"Optimize dataframe by using categorial columns and\n",
    "    demoting int data types.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame\n",
    "        ``pandas`` data frame. This is modified in-place.\n",
    "\n",
    "    thresh : float, optional\n",
    "        Threshold of unique to total ratio to\n",
    "        convert column to a category.\n",
    "\n",
    "    \"\"\"\n",
    "    # Convert repeating strings to category.\n",
    "    c_info = df.select_dtypes(include=['object']).describe()\n",
    "    ratio = c_info.loc['unique'] / c_info.loc['count']\n",
    "    for iname in ratio[ratio < thresh].index:\n",
    "        df[iname] = df[iname].astype('category')\n",
    "\n",
    "    ii08 = np.iinfo(np.int8)\n",
    "    ii16 = np.iinfo(np.int16)    \n",
    "    ii32 = np.iinfo(np.int32)\n",
    "\n",
    "    fi16 = np.finfo(np.float16)\n",
    "    fi32 = np.finfo(np.float32)\n",
    "    fi64 = np.finfo(np.float64)\n",
    "        \n",
    "    for col in df.columns:       \n",
    "        # Convert repeating numbers to category.\n",
    "        if df[col].dtype in (np.int8, np.int16, np.int32, np.int64,\n",
    "                             np.float16, np.float32, np.float64, np.float128):\n",
    "            vals = np.unique(df[col])\n",
    "            nvals = np.count_nonzero(np.isfinite(vals))\n",
    "            if nvals / len(df) < thresh:\n",
    "                df[col] = df[col].astype('category')\n",
    "                \n",
    "        # Demote numerical data types if still not a category.\n",
    "        # Demotion is level by level until lowest possible level is reached.\n",
    "        if df[col].dtype in (np.int16, np.int32, np.int64):\n",
    "            if df[col].dtype == np.int64:\n",
    "                if df[col].min() > ii32.min and df[col].max() < ii32.max:\n",
    "                    df[col] = df[col].astype('int32')               \n",
    "            if df[col].dtype == np.int32:\n",
    "                if df[col].min() > ii16.min and df[col].max() < ii16.max:\n",
    "                    df[col] = df[col].astype('int16')\n",
    "            else:  # int16\n",
    "                if df[col].min() > ii08.min and df[col].max() < ii08.max:\n",
    "                    df[col] = df[col].astype('int8')\n",
    "        # D. Mertz: Demoting float is a bad idea!\n",
    "        elif df[col].dtype in (np.float32, np.float64, np.float128):\n",
    "            if df[col].dtype == np.float128:\n",
    "                if df[col].min() > fi64.min and df[col].max() < fi64.max:\n",
    "                    df[col] = df[col].astype('float64')\n",
    "            if df[col].dtype == np.float64:\n",
    "                if df[col].min() > fi32.min and df[col].max() < fi32.max:\n",
    "                    df[col] = df[col].astype('float32')\n",
    "            else:  # float32\n",
    "                if df[col].min() > fi16.min and df[col].max() < fi16.max:\n",
    "                    df[col] = df[col].astype('float16')         "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 50000 entries, 0 to 49999\n",
      "Data columns (total 13 columns):\n",
      "abv                  48389 non-null float64\n",
      "beer_id              50000 non-null int64\n",
      "brewer_id            50000 non-null int64\n",
      "beer_name            50000 non-null object\n",
      "beer_style           50000 non-null object\n",
      "review_appearance    50000 non-null float64\n",
      "review_aroma         50000 non-null float64\n",
      "review_overall       50000 non-null float64\n",
      "review_palate        50000 non-null float64\n",
      "profile_name         50000 non-null object\n",
      "review_taste         50000 non-null float64\n",
      "text                 49991 non-null object\n",
      "time                 50000 non-null datetime64[ns]\n",
      "dtypes: datetime64[ns](1), float64(6), int64(2), object(4)\n",
      "memory usage: 5.3+ MB\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('data/beer2.csv.gz',\n",
    "                 index_col=0,\n",
    "                 parse_dates=['time'])\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 50000 entries, 0 to 49999\n",
      "Data columns (total 13 columns):\n",
      "abv                  48389 non-null float64\n",
      "beer_id              50000 non-null int64\n",
      "brewer_id            50000 non-null int64\n",
      "beer_name            50000 non-null object\n",
      "beer_style           50000 non-null category\n",
      "review_appearance    50000 non-null float64\n",
      "review_aroma         50000 non-null float64\n",
      "review_overall       50000 non-null float64\n",
      "review_palate        50000 non-null float64\n",
      "profile_name         50000 non-null category\n",
      "review_taste         50000 non-null float64\n",
      "text                 49991 non-null object\n",
      "time                 50000 non-null datetime64[ns]\n",
      "dtypes: category(2), datetime64[ns](1), float64(6), int64(2), object(2)\n",
      "memory usage: 4.8+ MB\n"
     ]
    }
   ],
   "source": [
    "# Original homework answer.\n",
    "df2 = df.copy()\n",
    "df_optimize_basic(df2)\n",
    "df2.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 50000 entries, 0 to 49999\n",
      "Data columns (total 13 columns):\n",
      "abv                  48389 non-null category\n",
      "beer_id              50000 non-null int32\n",
      "brewer_id            50000 non-null int16\n",
      "beer_name            50000 non-null category\n",
      "beer_style           50000 non-null category\n",
      "review_appearance    50000 non-null category\n",
      "review_aroma         50000 non-null category\n",
      "review_overall       50000 non-null category\n",
      "review_palate        50000 non-null category\n",
      "profile_name         50000 non-null category\n",
      "review_taste         50000 non-null category\n",
      "text                 49991 non-null object\n",
      "time                 50000 non-null datetime64[ns]\n",
      "dtypes: category(9), datetime64[ns](1), int16(1), int32(1), object(1)\n",
      "memory usage: 2.1+ MB\n"
     ]
    }
   ],
   "source": [
    "# D. Mertz's solution.\n",
    "df2 = df.copy()\n",
    "df_optimize(df2)\n",
    "df2.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 50000 entries, 0 to 49999\n",
      "Data columns (total 13 columns):\n",
      "abv                  48389 non-null category\n",
      "beer_id              50000 non-null category\n",
      "brewer_id            50000 non-null category\n",
      "beer_name            50000 non-null category\n",
      "beer_style           50000 non-null category\n",
      "review_appearance    50000 non-null category\n",
      "review_aroma         50000 non-null category\n",
      "review_overall       50000 non-null category\n",
      "review_palate        50000 non-null category\n",
      "profile_name         50000 non-null category\n",
      "review_taste         50000 non-null category\n",
      "text                 49991 non-null object\n",
      "time                 50000 non-null datetime64[ns]\n",
      "dtypes: category(11), datetime64[ns](1), object(1)\n",
      "memory usage: 2.1+ MB\n"
     ]
    }
   ],
   "source": [
    "# Overkill solution.\n",
    "# No memory usage improvement for this test case.\n",
    "df2 = df.copy()\n",
    "df_optimize_overkill(df2)\n",
    "df2.info()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Homework to auto optimize `pandas` data frame object."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Import modules needed for this notebook\n",
	"import pandas as pd\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def df_optimize_basic(df, thresh=0.1):\n",
	" \"\"\"Optimize dataframe by using categorial columns.\"\"\"\n",
	" c_info = df.select_dtypes(include=['object']).describe()\n",
	" ratio = c_info.loc['unique'] / c_info.loc['count']\n",
	" for iname in ratio[ratio < thresh].index:\n",
	" df[iname] = df[iname].astype('category')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Answer from instructor, David Mertz, \"Here's a bit more aggressive optimize() function. It gets the beer DataFrame down to less than half its original size. I'm pretty sure a wide range of queries will be considerably faster also.\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def df_optimize(df, thresh=0.25):\n",
	" \"\"\"Optimize dataframe by using categorial columns.\"\"\"\n",
	" c_info = df.select_dtypes(include=['object']).describe()\n",
	" ratio = c_info.loc['unique'] / c_info.loc['count']\n",
	" for iname in ratio[ratio < thresh].index:\n",
	" df[iname] = df[iname].astype('category')\n",
	" \n",
	" for col in df.columns:\n",
	" if df[col].dtype == np.int64:\n",
	" if df[col].min() > -215 and df[col].max() < 215:\n",
	" df[col] = df[col].astype('int16')\n",
	" elif df[col].min() > -231 and df[col].max() < 231:\n",
	" df[col] = df[col].astype('int32')\n",
	" if df[col].dtype in (np.float64, np.float128):\n",
	" vals = np.unique(df[col])\n",
	" nvals = vals[np.isfinite(vals)].size\n",
	" if nvals/len(df) < thresh:\n",
	" df[col] = df[col].astype('category')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Modified answer to be even more aggresive."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def df_optimize_overkill(df, thresh=0.25):\n",
	" \"\"\"Optimize dataframe by using categorial columns and\n",
	" demoting int data types.\n",
	"\n",
	" Parameters\n",
	" ----------\n",
	" df : DataFrame\n",
	" ``pandas`` data frame. This is modified in-place.\n",
	"\n",
	" thresh : float, optional\n",
	" Threshold of unique to total ratio to\n",
	" convert column to a category.\n",
	"\n",
	" \"\"\"\n",
	" # Convert repeating strings to category.\n",
	" c_info = df.select_dtypes(include=['object']).describe()\n",
	" ratio = c_info.loc['unique'] / c_info.loc['count']\n",
	" for iname in ratio[ratio < thresh].index:\n",
	" df[iname] = df[iname].astype('category')\n",
	"\n",
	" ii08 = np.iinfo(np.int8)\n",
	" ii16 = np.iinfo(np.int16) \n",
	" ii32 = np.iinfo(np.int32)\n",
	"\n",
	" fi16 = np.finfo(np.float16)\n",
	" fi32 = np.finfo(np.float32)\n",
	" fi64 = np.finfo(np.float64)\n",
	" \n",
	" for col in df.columns: \n",
	" # Convert repeating numbers to category.\n",
	" if df[col].dtype in (np.int8, np.int16, np.int32, np.int64,\n",
	" np.float16, np.float32, np.float64, np.float128):\n",
	" vals = np.unique(df[col])\n",
	" nvals = np.count_nonzero(np.isfinite(vals))\n",
	" if nvals / len(df) < thresh:\n",
	" df[col] = df[col].astype('category')\n",
	" \n",
	" # Demote numerical data types if still not a category.\n",
	" # Demotion is level by level until lowest possible level is reached.\n",
	" if df[col].dtype in (np.int16, np.int32, np.int64):\n",
	" if df[col].dtype == np.int64:\n",
	" if df[col].min() > ii32.min and df[col].max() < ii32.max:\n",
	" df[col] = df[col].astype('int32') \n",
	" if df[col].dtype == np.int32:\n",
	" if df[col].min() > ii16.min and df[col].max() < ii16.max:\n",
	" df[col] = df[col].astype('int16')\n",
	" else: # int16\n",
	" if df[col].min() > ii08.min and df[col].max() < ii08.max:\n",
	" df[col] = df[col].astype('int8')\n",
	" # D. Mertz: Demoting float is a bad idea!\n",
	" elif df[col].dtype in (np.float32, np.float64, np.float128):\n",
	" if df[col].dtype == np.float128:\n",
	" if df[col].min() > fi64.min and df[col].max() < fi64.max:\n",
	" df[col] = df[col].astype('float64')\n",
	" if df[col].dtype == np.float64:\n",
	" if df[col].min() > fi32.min and df[col].max() < fi32.max:\n",
	" df[col] = df[col].astype('float32')\n",
	" else: # float32\n",
	" if df[col].min() > fi16.min and df[col].max() < fi16.max:\n",
	" df[col] = df[col].astype('float16') "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"Int64Index: 50000 entries, 0 to 49999\n",
	"Data columns (total 13 columns):\n",
	"abv 48389 non-null float64\n",
	"beer_id 50000 non-null int64\n",
	"brewer_id 50000 non-null int64\n",
	"beer_name 50000 non-null object\n",
	"beer_style 50000 non-null object\n",
	"review_appearance 50000 non-null float64\n",
	"review_aroma 50000 non-null float64\n",
	"review_overall 50000 non-null float64\n",
	"review_palate 50000 non-null float64\n",
	"profile_name 50000 non-null object\n",
	"review_taste 50000 non-null float64\n",
	"text 49991 non-null object\n",
	"time 50000 non-null datetime64[ns]\n",
	"dtypes: datetime64[ns](1), float64(6), int64(2), object(4)\n",
	"memory usage: 5.3+ MB\n"
	]
	}
	],
	"source": [
	"df = pd.read_csv('data/beer2.csv.gz',\n",
	" index_col=0,\n",
	" parse_dates=['time'])\n",
	"df.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"Int64Index: 50000 entries, 0 to 49999\n",
	"Data columns (total 13 columns):\n",
	"abv 48389 non-null float64\n",
	"beer_id 50000 non-null int64\n",
	"brewer_id 50000 non-null int64\n",
	"beer_name 50000 non-null object\n",
	"beer_style 50000 non-null category\n",
	"review_appearance 50000 non-null float64\n",
	"review_aroma 50000 non-null float64\n",
	"review_overall 50000 non-null float64\n",
	"review_palate 50000 non-null float64\n",
	"profile_name 50000 non-null category\n",
	"review_taste 50000 non-null float64\n",
	"text 49991 non-null object\n",
	"time 50000 non-null datetime64[ns]\n",
	"dtypes: category(2), datetime64[ns](1), float64(6), int64(2), object(2)\n",
	"memory usage: 4.8+ MB\n"
	]
	}
	],
	"source": [
	"# Original homework answer.\n",
	"df2 = df.copy()\n",
	"df_optimize_basic(df2)\n",
	"df2.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"Int64Index: 50000 entries, 0 to 49999\n",
	"Data columns (total 13 columns):\n",
	"abv 48389 non-null category\n",
	"beer_id 50000 non-null int32\n",
	"brewer_id 50000 non-null int16\n",
	"beer_name 50000 non-null category\n",
	"beer_style 50000 non-null category\n",
	"review_appearance 50000 non-null category\n",
	"review_aroma 50000 non-null category\n",
	"review_overall 50000 non-null category\n",
	"review_palate 50000 non-null category\n",
	"profile_name 50000 non-null category\n",
	"review_taste 50000 non-null category\n",
	"text 49991 non-null object\n",
	"time 50000 non-null datetime64[ns]\n",
	"dtypes: category(9), datetime64[ns](1), int16(1), int32(1), object(1)\n",
	"memory usage: 2.1+ MB\n"
	]
	}
	],
	"source": [
	"# D. Mertz's solution.\n",
	"df2 = df.copy()\n",
	"df_optimize(df2)\n",
	"df2.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"Int64Index: 50000 entries, 0 to 49999\n",
	"Data columns (total 13 columns):\n",
	"abv 48389 non-null category\n",
	"beer_id 50000 non-null category\n",
	"brewer_id 50000 non-null category\n",
	"beer_name 50000 non-null category\n",
	"beer_style 50000 non-null category\n",
	"review_appearance 50000 non-null category\n",
	"review_aroma 50000 non-null category\n",
	"review_overall 50000 non-null category\n",
	"review_palate 50000 non-null category\n",
	"profile_name 50000 non-null category\n",
	"review_taste 50000 non-null category\n",
	"text 49991 non-null object\n",
	"time 50000 non-null datetime64[ns]\n",
	"dtypes: category(11), datetime64[ns](1), object(1)\n",
	"memory usage: 2.1+ MB\n"
	]
	}
	],
	"source": [
	"# Overkill solution.\n",
	"# No memory usage improvement for this test case.\n",
	"df2 = df.copy()\n",
	"df_optimize_overkill(df2)\n",
	"df2.info()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}