Skip to content

Instantly share code, notes, and snippets.

@ivankeller
Last active March 12, 2023 18:36
Show Gist options
  • Save ivankeller/5805b57acf5216b07902b1d5046a37c1 to your computer and use it in GitHub Desktop.
Save ivankeller/5805b57acf5216b07902b1d5046a37c1 to your computer and use it in GitHub Desktop.
describe dataframe columns
import pandas as pd
import numpy as np
def describe_df(df, cols=None, max_distinct=10):
    """Describe columns of the given dataframe.

    Return a dataframe with, for each described column, its name, dtype,
    number of missing values, number of distinct values and (when there are
    few enough of them) the distinct values themselves.

    Missing values are counted as one distinct value. For text columns
    (object or string dtype), empty strings are counted as missing in
    addition to null values.

    Issue: if a column contains integers and missing values, pandas casts it
    to float and adds ``.0`` to the values. This misleads the reported type
    and distinct values.

    Parameters
    ----------
    df : pandas dataframe
    cols : iterable of str, optional
        Column names to describe; if None (default) consider all columns.
        Any iterable is accepted (list, tuple, set, generator, ...).
    max_distinct : int, optional
        Distinct values are only reported for columns having at most this
        many distinct values (default 10); otherwise None is reported.

    Returns
    -------
    df : dataframe or None
        Dataframe with columns ``column``, ``type``, ``nb_missing``,
        ``nb_distinct`` and ``distinct_values``, one row per described
        column. If ``cols`` names columns that are not in ``df``, a message
        is printed and None is returned.
    """
    if cols is None:
        cols = list(df.columns)
    else:
        # Materialize once: a generator would be exhausted by the validation
        # below, and a set cannot be used to build the result dataframe.
        cols = list(cols)
        unknown_cols = set(cols) - set(df.columns)
        if unknown_cols:
            print("Unknown columns:", unknown_cols)
            return None

    types = []
    nb_missing = []
    nb_distinct = []
    distinct_values = []
    for col in cols:
        series = df[col]
        dtype = series.dtype
        types.append(dtype)

        # Distinct values (a null value counts as one distinct value).
        distincts = series.unique()
        nb_dist = len(distincts)
        nb_distinct.append(nb_dist)
        distinct_values.append(distincts if nb_dist <= max_distinct else None)

        # Missing values: nulls, plus empty strings for text-like columns.
        missing = series.isnull().sum()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            missing += (series == '').sum()
        nb_missing.append(missing)

    return pd.DataFrame(
        {
            'column': cols,
            'type': types,
            'nb_missing': nb_missing,
            'nb_distinct': nb_distinct,
            'distinct_values': distinct_values,
        },
        columns=['column', 'type', 'nb_missing', 'nb_distinct', 'distinct_values'],
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment