chetanambi/handle large datasets in Python.ipynb

## handle large datasets in Python.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VendorID                   int64\n",
       "tpep_pickup_datetime      object\n",
       "tpep_dropoff_datetime     object\n",
       "passenger_count            int64\n",
       "trip_distance            float64\n",
       "pickup_longitude         float64\n",
       "pickup_latitude          float64\n",
       "RatecodeID                 int64\n",
       "store_and_fwd_flag        object\n",
       "dropoff_longitude        float64\n",
       "dropoff_latitude         float64\n",
       "payment_type               int64\n",
       "fare_amount              float64\n",
       "extra                    float64\n",
       "mta_tax                  float64\n",
       "tip_amount               float64\n",
       "tolls_amount             float64\n",
       "improvement_surcharge    float64\n",
       "total_amount             float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv('yellow_tripdata_2016-01.csv')\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.5439861416816711"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.memory_usage().sum()/(1024*1024*1024)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "83.21279907226562"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['store_and_fwd_flag'].memory_usage()/(1024*1024)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use efficient data types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def reduce_mem_usage(df):\n",
    "    start_mem = df.memory_usage().sum() / 1024**3\n",
    "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == 'int':\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype('category')\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**3\n",
    "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
    "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 1.54 MB\n",
      "Memory usage after optimization is: 0.63 MB\n",
      "Decreased by 59.3%\n"
     ]
    }
   ],
   "source": [
    "df_new = reduce_mem_usage(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VendorID                     int8\n",
       "tpep_pickup_datetime     category\n",
       "tpep_dropoff_datetime    category\n",
       "passenger_count              int8\n",
       "trip_distance             float32\n",
       "pickup_longitude          float16\n",
       "pickup_latitude           float16\n",
       "RatecodeID                   int8\n",
       "store_and_fwd_flag       category\n",
       "dropoff_longitude         float16\n",
       "dropoff_latitude          float16\n",
       "payment_type                 int8\n",
       "fare_amount               float32\n",
       "extra                     float16\n",
       "mta_tax                   float16\n",
       "tip_amount                float16\n",
       "tolls_amount              float16\n",
       "improvement_surcharge     float16\n",
       "total_amount              float32\n",
       "dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10.401758193969727"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new['store_and_fwd_flag'].memory_usage()/(1024*1024)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chunking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "fare_amount_max = []\n",
    "\n",
    "for reader in pd.read_csv('yellow_tripdata_2016-01.csv', chunksize=100000):\n",
    "    # do any processing on reader\n",
    "    fare_amount_max.append(reader['fare_amount'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "111270.85"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "max(fare_amount_max)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"VendorID int64\n",
	"tpep_pickup_datetime object\n",
	"tpep_dropoff_datetime object\n",
	"passenger_count int64\n",
	"trip_distance float64\n",
	"pickup_longitude float64\n",
	"pickup_latitude float64\n",
	"RatecodeID int64\n",
	"store_and_fwd_flag object\n",
	"dropoff_longitude float64\n",
	"dropoff_latitude float64\n",
	"payment_type int64\n",
	"fare_amount float64\n",
	"extra float64\n",
	"mta_tax float64\n",
	"tip_amount float64\n",
	"tolls_amount float64\n",
	"improvement_surcharge float64\n",
	"total_amount float64\n",
	"dtype: object"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import pandas as pd\n",
	"df = pd.read_csv('yellow_tripdata_2016-01.csv')\n",
	"df.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1.5439861416816711"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.memory_usage().sum()/(102410241024)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"83.21279907226562"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df['store_and_fwd_flag'].memory_usage()/(1024*1024)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Use efficient data types"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"\n",
	"def reduce_mem_usage(df):\n",
	" start_mem = df.memory_usage().sum() / 1024**3\n",
	" print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
	" \n",
	" for col in df.columns:\n",
	" col_type = df[col].dtype\n",
	" \n",
	" if col_type != object:\n",
	" c_min = df[col].min()\n",
	" c_max = df[col].max()\n",
	" if str(col_type)[:3] == 'int':\n",
	" if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
	" df[col] = df[col].astype(np.int8)\n",
	" elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
	" df[col] = df[col].astype(np.int16)\n",
	" elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
	" df[col] = df[col].astype(np.int32)\n",
	" elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
	" df[col] = df[col].astype(np.int64) \n",
	" else:\n",
	" if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
	" df[col] = df[col].astype(np.float16)\n",
	" elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
	" df[col] = df[col].astype(np.float32)\n",
	" else:\n",
	" df[col] = df[col].astype(np.float64)\n",
	" else:\n",
	" df[col] = df[col].astype('category')\n",
	"\n",
	" end_mem = df.memory_usage().sum() / 1024**3\n",
	" print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
	" print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
	" \n",
	" return df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Memory usage of dataframe is 1.54 MB\n",
	"Memory usage after optimization is: 0.63 MB\n",
	"Decreased by 59.3%\n"
	]
	}
	],
	"source": [
	"df_new = reduce_mem_usage(df)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"VendorID int8\n",
	"tpep_pickup_datetime category\n",
	"tpep_dropoff_datetime category\n",
	"passenger_count int8\n",
	"trip_distance float32\n",
	"pickup_longitude float16\n",
	"pickup_latitude float16\n",
	"RatecodeID int8\n",
	"store_and_fwd_flag category\n",
	"dropoff_longitude float16\n",
	"dropoff_latitude float16\n",
	"payment_type int8\n",
	"fare_amount float32\n",
	"extra float16\n",
	"mta_tax float16\n",
	"tip_amount float16\n",
	"tolls_amount float16\n",
	"improvement_surcharge float16\n",
	"total_amount float32\n",
	"dtype: object"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_new.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10.401758193969727"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_new['store_and_fwd_flag'].memory_usage()/(1024*1024)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Chunking"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"fare_amount_max = []\n",
	"\n",
	"for reader in pd.read_csv('yellow_tripdata_2016-01.csv', chunksize=100000):\n",
	" # do any processing on reader\n",
	" fare_amount_max.append(reader['fare_amount'].max())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"111270.85"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"max(fare_amount_max)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}