Skip to content

Instantly share code, notes, and snippets.

@chetanambi
Created January 10, 2022 10:41
Show Gist options
  • Save chetanambi/8f82a4a4db02280311be8f48416d7d52 to your computer and use it in GitHub Desktop.
Save chetanambi/8f82a4a4db02280311be8f48416d7d52 to your computer and use it in GitHub Desktop.
How to handle large datasets in Python with Pandas
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"VendorID int64\n",
"tpep_pickup_datetime object\n",
"tpep_dropoff_datetime object\n",
"passenger_count int64\n",
"trip_distance float64\n",
"pickup_longitude float64\n",
"pickup_latitude float64\n",
"RatecodeID int64\n",
"store_and_fwd_flag object\n",
"dropoff_longitude float64\n",
"dropoff_latitude float64\n",
"payment_type int64\n",
"fare_amount float64\n",
"extra float64\n",
"mta_tax float64\n",
"tip_amount float64\n",
"tolls_amount float64\n",
"improvement_surcharge float64\n",
"total_amount float64\n",
"dtype: object"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('yellow_tripdata_2016-01.csv')\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.5439861416816711"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.memory_usage().sum()/(1024*1024*1024)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"83.21279907226562"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['store_and_fwd_flag'].memory_usage()/(1024*1024)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use efficient data types"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"def reduce_mem_usage(df):\n",
" start_mem = df.memory_usage().sum() / 1024**3\n",
" print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
" \n",
" for col in df.columns:\n",
" col_type = df[col].dtype\n",
" \n",
" if col_type != object:\n",
" c_min = df[col].min()\n",
" c_max = df[col].max()\n",
" if str(col_type)[:3] == 'int':\n",
" if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
" df[col] = df[col].astype(np.int8)\n",
" elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
" df[col] = df[col].astype(np.int16)\n",
" elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
" df[col] = df[col].astype(np.int32)\n",
" elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
" df[col] = df[col].astype(np.int64) \n",
" else:\n",
" if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
" df[col] = df[col].astype(np.float16)\n",
" elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
" df[col] = df[col].astype(np.float32)\n",
" else:\n",
" df[col] = df[col].astype(np.float64)\n",
" else:\n",
" df[col] = df[col].astype('category')\n",
"\n",
" end_mem = df.memory_usage().sum() / 1024**3\n",
" print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
" print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory usage of dataframe is 1.54 MB\n",
"Memory usage after optimization is: 0.63 MB\n",
"Decreased by 59.3%\n"
]
}
],
"source": [
"df_new = reduce_mem_usage(df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"VendorID int8\n",
"tpep_pickup_datetime category\n",
"tpep_dropoff_datetime category\n",
"passenger_count int8\n",
"trip_distance float32\n",
"pickup_longitude float16\n",
"pickup_latitude float16\n",
"RatecodeID int8\n",
"store_and_fwd_flag category\n",
"dropoff_longitude float16\n",
"dropoff_latitude float16\n",
"payment_type int8\n",
"fare_amount float32\n",
"extra float16\n",
"mta_tax float16\n",
"tip_amount float16\n",
"tolls_amount float16\n",
"improvement_surcharge float16\n",
"total_amount float32\n",
"dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10.401758193969727"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new['store_and_fwd_flag'].memory_usage()/(1024*1024)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Chunking"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"fare_amount_max = []\n",
"\n",
"for reader in pd.read_csv('yellow_tripdata_2016-01.csv', chunksize=100000):\n",
" # do any processing on reader\n",
" fare_amount_max.append(reader['fare_amount'].max())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"111270.85"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max(fare_amount_max)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment