Skip to content

Instantly share code, notes, and snippets.

@Ailuropoda1864
Created September 8, 2017 00:21
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Ailuropoda1864/5a067b50406534eb25ff268d4232efc8 to your computer and use it in GitHub Desktop.
Save Ailuropoda1864/5a067b50406534eb25ff268d4232efc8 to your computer and use it in GitHub Desktop.
exploratory data analysis performed on a pandas DataFrame
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import (
is_numeric_dtype, is_datetime64_dtype, is_bool_dtype
)
from pandas.core.indexes.datetimes import DatetimeIndex
def eda(dataframe, head=True, info=True, describe=True, duplicated=True,
        dup_kwd=None):
    """
    Run a quick exploratory data analysis on a DataFrame and print the results.

    :param dataframe: a pandas DataFrame
    :param head: boolean; if True, the first 5 rows of dataframe are shown
    :param info: boolean; if True, dataframe.info() and nulls are shown
    :param describe: boolean; if True, descriptions of the columns (grouped by
        numeric, datetime, and other) are shown
    :param duplicated: boolean; if True, info on duplicated rows is shown
    :param dup_kwd: dict of keyword arguments forwarded to find_duplicate;
        None (the default) forwards no extra arguments
    :return: None
    :raises AssertionError: if dataframe is not a pandas DataFrame
    """
    # Kept as an assert (not a raise) so the exception type seen by existing
    # callers does not change.
    assert isinstance(dataframe, pd.DataFrame), \
        "pandas DataFrame is required; got {} instead".format(type(dataframe))

    if head:
        print('Head of the dataframe:\n\n{}\n\n'.format(dataframe.head()))

    # shape, index, columns, nulls, dtypes
    if info:
        dataframe.info()
        print('\n')
        show_null(dataframe)
        print('\n')

    if describe:
        describe_by_type(dataframe)

    # find duplicates
    if duplicated:
        # A None default avoids sharing one dict object between calls.
        find_duplicate(dataframe, **(dup_kwd or {}))
def describe_by_type(dataframe):
    """
    Print descriptions of the columns (grouped by numeric, datetime, boolean,
    and others) and of the DatetimeIndex (if any).

    A group is printed only when at least one column of that kind is present.

    :param dataframe: a pandas DataFrame
    :return: None
    """
    dtype_api = pd.api.types
    has_boolean = has_numeric = has_datetime = has_other = False

    # Classify on the dtype objects themselves, so the check does not depend
    # on looking columns up by label.
    for dtype in dataframe.dtypes:
        # bool is tested first because is_numeric_dtype also accepts bool
        if dtype_api.is_bool_dtype(dtype):
            has_boolean = True
        elif dtype_api.is_numeric_dtype(dtype):
            has_numeric = True
        elif dtype_api.is_datetime64_dtype(dtype):
            has_datetime = True
        else:
            has_other = True

    # describe datetime columns and DatetimeIndex (if any)
    if isinstance(dataframe.index, pd.DatetimeIndex):
        print(pd.Series(dataframe.index).describe())
        print('\n')
    if has_datetime:
        print(dataframe.describe(include=['datetime']))
        print('\n')

    # describe numeric columns (if any); the explicit include keeps datetime
    # columns out of this section on pandas versions whose default describe()
    # summarises them together with the numeric ones
    if has_numeric:
        print(dataframe.describe(include=[np.number]))
        print('\n')

    # describe boolean columns (if any); the builtin bool is used because the
    # np.bool alias is not available on every NumPy release
    if has_boolean:
        print(dataframe.describe(include=[bool]))
        print('\n')

    # describe other columns (if any)
    if has_other:
        print(dataframe.describe(exclude=[np.number, np.datetime64, bool]))
        print('\n')
def show_null(dataframe):
    """
    Print the number and percentage of null values in each column.

    When the dataframe contains no nulls at all, a single line saying so is
    printed instead of the per-column tables.

    :param dataframe: a pandas DataFrame
    :return: None
    """
    # Count once and reuse; the per-column counts feed both tables.
    null_counts = dataframe.isnull().sum()

    if null_counts.sum() == 0:
        print('No null in the dataframe.')
        return

    print('Number of nulls in each column:\n{}\n'.format(null_counts))
    # Scale by 100 so the values match the "Percentage" label (0-100, not a
    # 0-1 fraction). len(dataframe) cannot be 0 here: a null was found.
    print('Percentage of nulls in each column:\n{}\n'.format(
        null_counts / len(dataframe) * 100
    ))
def find_duplicate(dataframe, show=True, sort=False):
"""
prints out information on duplicate rows
:param dataframe: a pandas DataFrame
:param show: boolean; if True, the duplicated rows (if any) are shown
:param sort: boolean; if True, the duplicated rows are sorted by each column
of the dataframe
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment