Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
exploratory data analysis performed on a pandas DataFrame
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import (
is_numeric_dtype, is_datetime64_dtype, is_bool_dtype
)
from pandas.core.indexes.datetimes import DatetimeIndex
def eda(dataframe, head=True, info=True, describe=True, duplicated=True,
        dup_kwd=None):
    """
    exploratory data analysis: prints a series of summaries of a DataFrame

    :param dataframe: a pandas DataFrame
    :param head: boolean; if True, the first 5 rows of dataframe is shown
    :param info: boolean; if True, dataframe.info() and nulls are shown
    :param describe: boolean; if True, descriptions of the columns (grouped by
        numeric, datetime, and other) are shown
    :param duplicated: boolean; if True, info on duplicated rows are shown
    :param dup_kwd: dict of keyword arguments forwarded to find_duplicate;
        None (the default) forwards no extra arguments
    :return: None
    """
    assert isinstance(dataframe, pd.DataFrame), \
        "pandas DataFrame is required; got {} instead".format(type(dataframe))

    if head:
        print('Head of the dataframe:\n\n{}\n\n'.format(dataframe.head()))

    # shape, index, columns, nulls, dtypes
    if info:
        dataframe.info()
        print('\n')
        show_null(dataframe)
        print('\n')

    if describe:
        describe_by_type(dataframe)

    # find duplicates
    if duplicated:
        # a None sentinel replaces the former shared mutable `{}` default
        find_duplicate(dataframe, **(dup_kwd or {}))
def describe_by_type(dataframe):
    """
    prints descriptions of the columns (grouped by numeric, datetime, boolean,
    and others) and DatetimeIndex (if any)

    Each group is printed only when at least one column of that kind exists.

    :param dataframe: a pandas DataFrame
    :return: None
    """
    boolean, numeric, datetime, other = False, False, False, False
    for column in dataframe.columns:
        values = dataframe[column]
        # bool is tested first because is_numeric_dtype also accepts
        # boolean columns
        if is_bool_dtype(values):
            boolean = True
        elif is_numeric_dtype(values):
            numeric = True
        elif is_datetime64_dtype(values):
            datetime = True
        else:
            other = True

    # describe datetime columns and DatetimeIndex (if any)
    if isinstance(dataframe.index, DatetimeIndex):
        print(pd.Series(dataframe.index).describe())
        print('\n')
    if datetime:
        print(dataframe.describe(include=['datetime']))
        print('\n')

    # describe numeric columns (if any)
    if numeric:
        # restrict to np.number explicitly: the bare describe() default in
        # recent pandas also selects datetime columns, which are already
        # reported above
        print(dataframe.describe(include=[np.number]))
        print('\n')

    # describe boolean columns (if any)
    if boolean:
        # np.bool_ instead of the np.bool alias, which NumPy 1.24 removed
        print(dataframe.describe(include=[np.bool_]))
        print('\n')

    # describe other columns (if any)
    if other:
        print(dataframe.describe(
            exclude=[np.number, np.datetime64, np.bool_]
        ))
        print('\n')
def show_null(dataframe):
    """
    prints the number and percentage of null values in each column

    If the dataframe holds no nulls at all, a single message saying so is
    printed instead.

    :param dataframe: a pandas DataFrame
    :return: None
    """
    # per-column null counts, computed once and reused below
    null_counts = dataframe.isnull().sum()
    if null_counts.sum() == 0:
        print('No null in the dataframe.')
        return

    print('Number of nulls in each column:\n{}\n'.format(null_counts))
    # scaled by 100 so the figures are real percentages, matching the label
    # (len(dataframe) cannot be 0 here: an empty frame has no nulls)
    print('Percentage of nulls in each column:\n{}\n'.format(
        null_counts / len(dataframe) * 100
    ))
def find_duplicate(dataframe, show=True, sort=False):
"""
prints out information on duplicate rows
:param dataframe: a pandas DataFrame
:param show: boolean; if True, the duplicated rows (if any) are shown
:param sort: boolean; if True, the duplicated rows are sorted by each column
of the dataframe
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.