# yohann84L/reduce_mem_usage.py

## reduce_mem_usage.py
def reduce_mem_usage(props):
    """Shrink a DataFrame by downcasting its numeric columns.

    Adapted from https://www.kaggle.com/jesucristo/fraud-complete-eda.

    WARNING: ``props`` is modified in place and the conversion is lossy:

    * NaN values in numeric columns are replaced by -1.
    * Float columns that are not whole-valued are stored as float32, which
      reduces precision (and overflows to inf beyond the float32 range).

    Only columns backed by a NumPy integer or float dtype are converted.
    Every other column (strings, categories, booleans, datetimes, pandas
    extension dtypes) is left untouched. A column is never converted to a
    dtype wider than the one it already has.

    A table of the converted columns is re-printed after each column, and a
    before/after memory summary is printed at the end.

    Args:
        props: pandas DataFrame to shrink.

    Returns:
        A ``(props, NAlist)`` tuple: the same DataFrame object, and the list
        of column names whose NaN values were replaced by -1.
    """
    import os
    import sys

    import numpy as np

    # IPython and tabulate only make the progress output nicer, so they are
    # treated as optional instead of being hard requirements.
    try:
        from IPython import get_ipython
        from IPython.display import clear_output
    except ImportError:
        get_ipython = clear_output = None
    is_running_from_nb = get_ipython is not None and get_ipython() is not None

    try:
        from tabulate import tabulate
    except ImportError:
        tabulate = _plain_table

    clear_method = 'cls' if os.name == 'nt' else 'clear'

    def _update_tab(table, headers, new_value):
        """Append ``new_value`` to ``table`` and redraw the whole table."""
        if is_running_from_nb:
            clear_output(wait=True)
        elif sys.stdout is not None and sys.stdout.isatty():
            # Clearing only makes sense on a real terminal; skip the shell
            # call when the output is piped or captured.
            os.system(clear_method)
        table.append(new_value)
        print(tabulate(table, headers))

    start_mem_usg = props.memory_usage().sum() / 1024**2

    header = ['Column', 'dtype before', 'dtype after']
    table = []

    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        series = props[col]
        dtype_before = series.dtype

        # Restrict to plain NumPy int/uint/float columns.
        if not (isinstance(dtype_before, np.dtype) and dtype_before.kind in 'iuf'):
            continue

        # Integers cannot represent NaN, so NaN is replaced by -1 first.
        if series.isna().any():
            NAlist.append(col)
            series = series.fillna(-1)
            props[col] = series

        values = series.to_numpy()
        target = None
        if values.size:
            is_float = dtype_before.kind == 'f'
            # Element-wise test: every value must be finite and equal to its
            # truncation. Summing the differences would let errors cancel
            # out (e.g. 0.5 and -0.5) and wrongly flag the column as integer.
            whole = not is_float or bool(
                np.isfinite(values).all() and (values == np.trunc(values)).all()
            )
            if whole:
                # .item() yields Python scalars so range checks are exact.
                target = _smallest_int_dtype(values.min().item(), values.max().item())
                if target is not None and np.dtype(target).itemsize > dtype_before.itemsize:
                    target = None  # never widen a column
            if target is None and is_float and dtype_before.itemsize > 4:
                target = np.float32

        if target is not None:
            props[col] = series.astype(target)

        _update_tab(table, header, [col, dtype_before, props[col].dtype])

    # Print final result
    print()
    print("___MEMORY USAGE BEFORE COMPLETION:___")
    print("Memory usage is: ",start_mem_usg," MB")
    print()
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    if start_mem_usg > 0:  # avoid dividing by zero on an empty frame
        print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist


def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype holding ``[lo, hi]``, or None.

    Unsigned types are used when ``lo`` is non-negative, signed types
    otherwise. Bounds are inclusive. ``lo`` and ``hi`` must be Python scalars.
    """
    import numpy as np

    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def _plain_table(rows, headers):
    """Format ``rows`` under ``headers`` as aligned text (tabulate fallback)."""
    cells = [[str(cell) for cell in row] for row in [headers, *rows]]
    widths = [max(len(row[i]) for row in cells) for i in range(len(headers))]
    lines = [
        "  ".join(cell.ljust(width) for cell, width in zip(row, widths)).rstrip()
        for row in cells
    ]
    lines.insert(1, "  ".join("-" * width for width in widths))
    return "\n".join(lines)