Created
January 10, 2022 10:41
-
-
Save chetanambi/8f82a4a4db02280311be8f48416d7d52 to your computer and use it in GitHub Desktop.
How to handle large datasets in Python with Pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"VendorID int64\n", | |
"tpep_pickup_datetime object\n", | |
"tpep_dropoff_datetime object\n", | |
"passenger_count int64\n", | |
"trip_distance float64\n", | |
"pickup_longitude float64\n", | |
"pickup_latitude float64\n", | |
"RatecodeID int64\n", | |
"store_and_fwd_flag object\n", | |
"dropoff_longitude float64\n", | |
"dropoff_latitude float64\n", | |
"payment_type int64\n", | |
"fare_amount float64\n", | |
"extra float64\n", | |
"mta_tax float64\n", | |
"tip_amount float64\n", | |
"tolls_amount float64\n", | |
"improvement_surcharge float64\n", | |
"total_amount float64\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"df = pd.read_csv('yellow_tripdata_2016-01.csv')\n", | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1.5439861416816711" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.memory_usage().sum()/(1024*1024*1024)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"83.21279907226562" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['store_and_fwd_flag'].memory_usage()/(1024*1024)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Use efficient data types" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"def reduce_mem_usage(df):\n", | |
" start_mem = df.memory_usage().sum() / 1024**3\n", | |
" print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n", | |
" \n", | |
" for col in df.columns:\n", | |
" col_type = df[col].dtype\n", | |
" \n", | |
" if col_type != object:\n", | |
" c_min = df[col].min()\n", | |
" c_max = df[col].max()\n", | |
" if str(col_type)[:3] == 'int':\n", | |
" if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", | |
" df[col] = df[col].astype(np.int8)\n", | |
" elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", | |
" df[col] = df[col].astype(np.int16)\n", | |
" elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", | |
" df[col] = df[col].astype(np.int32)\n", | |
" elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", | |
" df[col] = df[col].astype(np.int64) \n", | |
" else:\n", | |
" if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", | |
" df[col] = df[col].astype(np.float16)\n", | |
" elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", | |
" df[col] = df[col].astype(np.float32)\n", | |
" else:\n", | |
" df[col] = df[col].astype(np.float64)\n", | |
" else:\n", | |
" df[col] = df[col].astype('category')\n", | |
"\n", | |
" end_mem = df.memory_usage().sum() / 1024**3\n", | |
" print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n", | |
" print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n", | |
" \n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Memory usage of dataframe is 1.54 MB\n", | |
"Memory usage after optimization is: 0.63 MB\n", | |
"Decreased by 59.3%\n" | |
] | |
} | |
], | |
"source": [ | |
"df_new = reduce_mem_usage(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"VendorID int8\n", | |
"tpep_pickup_datetime category\n", | |
"tpep_dropoff_datetime category\n", | |
"passenger_count int8\n", | |
"trip_distance float32\n", | |
"pickup_longitude float16\n", | |
"pickup_latitude float16\n", | |
"RatecodeID int8\n", | |
"store_and_fwd_flag category\n", | |
"dropoff_longitude float16\n", | |
"dropoff_latitude float16\n", | |
"payment_type int8\n", | |
"fare_amount float32\n", | |
"extra float16\n", | |
"mta_tax float16\n", | |
"tip_amount float16\n", | |
"tolls_amount float16\n", | |
"improvement_surcharge float16\n", | |
"total_amount float32\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_new.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10.401758193969727" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_new['store_and_fwd_flag'].memory_usage()/(1024*1024)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Chunking" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fare_amount_max = []\n", | |
"\n", | |
"for reader in pd.read_csv('yellow_tripdata_2016-01.csv', chunksize=100000):\n", | |
" # do any processing on reader\n", | |
" fare_amount_max.append(reader['fare_amount'].max())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"111270.85" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"max(fare_amount_max)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment