Calculate the percentage of NaN values in a Pandas DataFrame for each column, excluding columns that do not contain any NaN values.
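In essence, the snippet below computes the per-column fraction of missing values, restricts the result to columns that contain at least one NaN, and then demonstrates a few column- and row-dropping strategies based on that fraction. A minimal sketch of the core calculation (assuming a DataFrame df is already in scope):

# Fraction of NaN values per column, rounded to 2 decimal places
missing = df.isnull().mean().round(2)
# Keep only the columns that actually contain NaN values
missing = missing[missing > 0]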
# Author: Luke Schoen 2017
import pandas as pd
import numpy as np
import functools
# Create a DataFrame
# df = pd.DataFrame(np.random.randn(10, 2))
# Populate it with some NaN values (numeric values, so that the rounding below works)
df = pd.DataFrame({'col1': [1.111, 2.111, 3.111, 4.111],
                   'col2': [4.111, 5.111, np.nan, 7.111],
                   'col3': [8, 9, np.nan, np.nan],
                   'col4': [12, 13, 14, 15]})
# Round all values to 2 decimal places (assign the result back, apply does not modify in place)
df = df.apply(functools.partial(np.round, decimals=2))
# Populate column 0, rows 3 to 5 (only row 3 exists in this 4-row frame), with NaN values
df.iloc[3:6, 0] = np.nan
def get_percentage_missing(series):
    """ Calculate the fraction of NaN values (e.g. 0.25 for 25% missing)
    :param series: Pandas Series (or DataFrame) object
    :return: float (or a Series of floats when given a DataFrame)
    """
    num = series.isnull().sum()
    den = len(series)
    return round(num / den, 2)
# Only include columns that contain any NaN values
# (copy so that dropping columns/rows later does not warn about modifying a slice of df)
df_with_any_null_values = df[df.columns[df.isnull().any()].tolist()].copy()
print(get_percentage_missing(df_with_any_null_values))
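# A vectorized equivalent of the two steps above (shown for reference, not used below):
#   df.isnull().mean() gives the fraction of NaN values per column, and
#   df.loc[:, df.isnull().any()] keeps only the columns with at least one NaN.
# missing_fractions = df.loc[:, df.isnull().any()].isnull().mean().round(2)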
# Show the quantity of each value in a column
# df.astype(str).groupby(['col1']).sum()
# Show the DataFrame
print(df)
# print(df.head())
# Show DataFrame summary statistics and info
print(df.describe())
df.info()  # info() prints directly and returns None, so no print() is needed
# Iterate over columns in the DataFrame and delete those where >30% of the values are null/NaN
# (iterate over a materialised list so that dropping columns does not interfere with iteration)
for name, values in list(df_with_any_null_values.items()):
    print("%r: %r" % (name, values))
    if get_percentage_missing(df_with_any_null_values[name]) > 0.3:
        print("Deleting Column %r: " % (name))
        df_with_any_null_values.drop(name, axis=1, inplace=True)
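# A roughly equivalent vectorized approach (sketch, not used above): keep only columns
# with at least 70% non-NaN values, i.e. drop columns with more than 30% missing.
# df_with_any_null_values = df_with_any_null_values.dropna(
#     axis=1, thresh=int(np.ceil(0.7 * len(df_with_any_null_values))))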
# Iterate over columns in the DataFrame; for columns with under 1% missing values,
# keep the column but delete the rows where its value is null/NaN
for name, values in list(df_with_any_null_values.items()):
    if name != "id":  # skip an identifier column if one exists
        if get_percentage_missing(df_with_any_null_values[name]) < 0.01:
            print("Retained Column: %r, but removed its null and NaN valued rows" % (name))
            print("BEFORE %r: %r" % (name, values))
            df_with_any_null_values.dropna(axis=0, how="any", subset=[name], inplace=True)
            print("AFTER %r: %r" % (name, df_with_any_null_values[name]))
# Select only columns of certain types
# http://pandas.pydata.org/pandas-docs/version/0.19.2/generated/pandas.DataFrame.select_dtypes.html
print(df.select_dtypes(include=['number'], exclude=['object']))
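# For example, to split numeric and string/object columns (assuming that split is what
# is wanted downstream):
# numeric_cols = df.select_dtypes(include=[np.number])
# object_cols = df.select_dtypes(include=['object'])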
# Iterate over columns and branch on each column's dtype
# IMPORTANT NOTE: ONLY USE THIS AFTER NaN VALUES HAVE BEEN REMOVED
for name, values in df.items():
    # print("%r, %r" % (name, values))
    if values.dtype == np.float64 or values.dtype == np.int64:
        print("float or int type %r" % (values.dtype))
        # treat_numeric(df[name])
    elif values.dtype == object:
        # string columns are stored with the generic object dtype by default
        print("string/object type %r" % (values.dtype))
        # treat_str(df[name]) / treat_object(df[name])
    else:
        print("other type %r" % (values.dtype))