Skip to content

Instantly share code, notes, and snippets.

@MaxPowerWasTaken
Last active August 1, 2017 05:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MaxPowerWasTaken/1f4bdb6e3ff3d5491221e3a7c748cf1f to your computer and use it in GitHub Desktop.
Save MaxPowerWasTaken/1f4bdb6e3ff3d5491221e3a7c748cf1f to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
def _smallest_int_type(col_min, col_max):
    """Return the name of the narrowest NumPy int type holding [col_min, col_max]."""
    if col_min >= 0:
        # No negative values, so the unsigned types are candidates.
        candidates = (np.uint8, np.uint16, np.uint32)
        fallback = 'uint64'
    else:
        # Negative values present: only signed types can represent them.
        candidates = (np.int8, np.int16, np.int32)
        fallback = 'int64'

    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Bounds are inclusive: iinfo.min / iinfo.max are themselves
        # representable (255 fits in uint8, -128 fits in int8).
        if limits.min <= col_min and col_max <= limits.max:
            return candidate.__name__
    return fallback


def check_var_size_requirements(df):
    """Print, for each integer column, the smallest int type that can hold it.

    The function is informational only: it does not recast any column and
    it returns None. For every integer-typed column of ``df`` it prints the
    column name, its current dtype, and the narrowest NumPy integer type
    whose range covers the column's minimum and maximum values. Unsigned
    types are suggested when the column has no negative values.

    Args:
        df: DataFrame to inspect. Non-integer columns are ignored.

    Note:
        If min()/max() yield NaN (presumably the case for an empty column),
        every range comparison is False and 'int64' is reported.
    """
    # pandas' read_* functions assign an int dtype when every value is an
    # integer and there are no NaNs, so selecting by dtype finds them.
    int_types = ['int', np.int8, np.int16, np.int32, np.int64,
                 np.uint8, np.uint16, np.uint32, np.uint64]
    int_df = df.select_dtypes(include=int_types)

    for col_name in int_df.columns:
        column = int_df[col_name]
        use_type = _smallest_int_type(column.min(), column.max())

        # Report only; the caller decides whether to actually downcast.
        print("{}\ncurrent type {}, could fit safely in type {} \n".format(
            col_name, column.dtype, use_type))
########################
# EXAMPLE / TEST
########################
#test_df = pd.DataFrame({'a': [100],
# 'b': [1000],
# 'c': [10000],
# 'd': [100000],
# 'e': [-150],
# 'f': [100]})
#
#test_df['f'] = test_df['f'].astype(np.int32)
#
#check_var_size_requirements(test_df)
# OUTPUT:
#a
#current type int64, could fit safely in type uint8
#
#b
#current type int64, could fit safely in type uint16
#
#c
#current type int64, could fit safely in type uint16
#
#d
#current type int64, could fit safely in type uint32
#
#e
#current type int64, could fit safely in type int16
#
#f
#current type int32, could fit safely in type uint8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment