Skip to content

Instantly share code, notes, and snippets.

@aurelienpierre
Last active March 16, 2022 10:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save aurelienpierre/ab427b5bb89fdc3e091a6c48cbea7a2b to your computer and use it in GitHub Desktop.
Save aurelienpierre/ab427b5bb89fdc3e091a6c48cbea7a2b to your computer and use it in GitHub Desktop.
Perform the usual type and value checks on the columns of a pandas.DataFrame
# Create a DataFrame with fake data.
# The sample is built from public numpy/pandas APIs only: the private
# pd.util.testing helpers were deprecated in pandas 1.0 and removed in 2.0.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)  # fixed seed so the demo is reproducible
values = rng.standard_normal((30, 4))
values[rng.random(values.shape) > 0.9] = np.nan  # knock out roughly 10 % of the cells
df = pd.DataFrame(
    values,
    columns=list("ABCD"),
    index=["row%02d" % i for i in range(30)],
)
df['index1'] = df.index  # create a text column by replicating the index
df['A'] = 0  # overwrite A with an all-zero column
# Helper function
def check_df_sanity(df, verbose=False):
    """Print a per-column sanity report for a pandas.DataFrame.

    For every column the report states whether its dtype is numeric, string
    or datetime, and how many entries are missing. Numeric columns also get
    their average ± standard deviation (as returned by ``np.mean`` and
    ``np.std``) and the count and percentage of zero and negative values.

    Args:
        df: the DataFrame to inspect.
        verbose: when true, additionally report whether the column label
            can be compiled as a regular expression.

    Returns:
        None. The report is written to standard output.
    """
    # items() yields one (label, Series) pair per column, so duplicated
    # labels are handled one column at a time (df[label] would return a
    # whole DataFrame for them).
    for col, series in df.items():
        # Type checks
        is_numeric = pd.api.types.is_numeric_dtype(series)
        is_string = pd.api.types.is_string_dtype(series)
        is_date = pd.api.types.is_datetime64_any_dtype(series)

        # NaN checks
        num_elem = series.size
        n_missing = series.isnull().sum()

        # Value checks (only meaningful for numeric columns)
        average = 0
        std = 0
        n_zeros = 0
        n_negatives = 0
        if is_numeric:
            average = np.mean(series)
            std = np.std(series)
            n_zeros = (series == 0).sum()
            n_negatives = (series < 0).sum()

        # Report
        # The label is wrapped in a 1-tuple so that tuple labels (MultiIndex
        # columns) are formatted as a single value instead of being unpacked
        # as several format arguments.
        print("column %s :" % (col,))
        if is_numeric:
            print("\tis numeric")
            print("\thas average = (%f ± %f)" % (average, std))
            # A non-zero count implies num_elem > 0, so the divisions are safe.
            if n_zeros:
                print("\thas %i zero values (%.2f %%)" % (n_zeros, 100. * n_zeros / num_elem))
            if n_negatives:
                print("\thas %i negative values (%.2f %%)" % (n_negatives, 100. * n_negatives / num_elem))
        if is_string:
            print("\tis string")
        if is_date:
            print("\tis date")
        if n_missing:
            print("\thas %i missing entries (%.2f %%)" % (n_missing, 100. * n_missing / num_elem))
        if verbose:
            # less used/useful checks go here
            if pd.api.types.is_re_compilable(col):
                print("\thas a title that can be used in regex")
# Demo: print the sanity report for the sample DataFrame `df` (default, non-verbose mode)
check_df_sanity(df)
# Example output (the exact numbers depend on the randomly generated sample data)
"""
column A :
is numeric
has average = (0.000000 ± 0.000000)
has 30 zero values (100.00 %)
column B :
is numeric
has average = (0.030225 ± 1.107637)
has 13 negative values (43.33 %)
has 6 missing entries (20.00 %)
column C :
is numeric
has average = (-0.169237 ± 0.999756)
has 14 negative values (46.67 %)
has 3 missing entries (10.00 %)
column D :
is numeric
has average = (-0.141427 ± 1.015646)
has 18 negative values (60.00 %)
column index1 :
is string
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment