Calculate the percentage of NaN values in a Pandas DataFrame for each column, excluding columns that do not contain any NaN values.
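In essence, the snippet below computes the per-column fraction of missing values, restricts the result to columns that contain at least one NaN, and then demonstrates a few column- and row-dropping strategies based on that fraction. A minimal sketch of the core calculation (assuming a DataFrame df is already in scope):

# Fraction of NaN values per column, rounded to 2 decimal places
missing = df.isnull().mean().round(2)
# Keep only the columns that actually contain NaN values
missing = missing[missing > 0]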
# Author: Luke Schoen 2017
import pandas as pd
import numpy as np
import functools
# Create a DataFrame
# df = pd.DataFrame(np.random.randn(10, 2))
# Populate it with some NaN values (numeric values, so that the rounding below works)
df = pd.DataFrame({'col1': [1.111, 2.111, 3.111, 4.111],
                   'col2': [4.111, 5.111, np.nan, 7.111],
                   'col3': [8, 9, np.nan, np.nan],
                   'col4': [12, 13, 14, 15]})
# Round all values to 2 decimal places (assign the result back, apply does not modify in place)
df = df.apply(functools.partial(np.round, decimals=2))
# Populate column 0, rows 3 to 5 (only row 3 exists in this 4-row frame), with NaN values
df.iloc[3:6, 0] = np.nan
def get_percentage_missing(series):
    """ Calculate the fraction of NaN values (e.g. 0.25 for 25% missing)
    :param series: Pandas Series (or DataFrame) object
    :return: float (or a Series of floats when given a DataFrame)
    """
    num = series.isnull().sum()
    den = len(series)
    return round(num / den, 2)
# Only include columns that contain any NaN values
# (copy so that dropping columns/rows later does not warn about modifying a slice of df)
df_with_any_null_values = df[df.columns[df.isnull().any()].tolist()].copy()
print(get_percentage_missing(df_with_any_null_values))
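# A vectorized equivalent of the two steps above (shown for reference, not used below):
#   df.isnull().mean() gives the fraction of NaN values per column, and
#   df.loc[:, df.isnull().any()] keeps only the columns with at least one NaN.
# missing_fractions = df.loc[:, df.isnull().any()].isnull().mean().round(2)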
# Show the quantity of each value in a column
# df.astype(str).groupby(['col1']).sum()
# Show the DataFrame
print(df)
# print(df.head())
# Show DataFrame summary statistics and info
print(df.describe())
df.info()  # info() prints directly and returns None, so no print() is needed
# Iterate over columns in the DataFrame and delete those where >30% of the values are null/NaN
# (iterate over a materialised list so that dropping columns does not interfere with iteration)
for name, values in list(df_with_any_null_values.items()):
    print("%r: %r" % (name, values))
    if get_percentage_missing(df_with_any_null_values[name]) > 0.3:
        print("Deleting Column %r: " % (name))
        df_with_any_null_values.drop(name, axis=1, inplace=True)
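# A roughly equivalent vectorized approach (sketch, not used above): keep only columns
# with at least 70% non-NaN values, i.e. drop columns with more than 30% missing.
# df_with_any_null_values = df_with_any_null_values.dropna(
#     axis=1, thresh=int(np.ceil(0.7 * len(df_with_any_null_values))))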
# Iterate over columns in the DataFrame; for columns with under 1% missing values,
# keep the column but delete the rows where its value is null/NaN
for name, values in list(df_with_any_null_values.items()):
    if name != "id":  # skip an identifier column if one exists
        if get_percentage_missing(df_with_any_null_values[name]) < 0.01:
            print("Retained Column: %r, but removed its null and NaN valued rows" % (name))
            print("BEFORE %r: %r" % (name, values))
            df_with_any_null_values.dropna(axis=0, how="any", subset=[name], inplace=True)
            print("AFTER %r: %r" % (name, df_with_any_null_values[name]))
# Select only columns of certain types
# http://pandas.pydata.org/pandas-docs/version/0.19.2/generated/pandas.DataFrame.select_dtypes.html
print(df.select_dtypes(include=['number'], exclude=['object']))
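# For example, to split numeric and string/object columns (assuming that split is what
# is wanted downstream):
# numeric_cols = df.select_dtypes(include=[np.number])
# object_cols = df.select_dtypes(include=['object'])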
# Iterate over columns and branch on each column's dtype
# IMPORTANT NOTE: ONLY USE THIS AFTER NaN VALUES HAVE BEEN REMOVED
for name, values in df.items():
    # print("%r, %r" % (name, values))
    if values.dtype == np.float64 or values.dtype == np.int64:
        print("float or int type %r" % (values.dtype))
        # treat_numeric(df[name])
    elif values.dtype == object:
        # string columns are stored with the generic object dtype by default
        print("string/object type %r" % (values.dtype))
        # treat_str(df[name]) / treat_object(df[name])
    else:
        print("other type %r" % (values.dtype))