Created
September 8, 2017 00:21
-
-
Save Ailuropoda1864/5a067b50406534eb25ff268d4232efc8 to your computer and use it in GitHub Desktop.
exploratory data analysis performed on a pandas DataFrame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from pandas.core.dtypes.common import ( | |
is_numeric_dtype, is_datetime64_dtype, is_bool_dtype | |
) | |
from pandas.core.indexes.datetimes import DatetimeIndex | |
def eda(dataframe, head=True, info=True, describe=True, duplicated=True,
        dup_kwd=None):
    """
    print an exploratory data analysis report for a pandas DataFrame

    :param dataframe: a pandas DataFrame
    :param head: boolean; if True, the first 5 rows of dataframe are shown
    :param info: boolean; if True, dataframe.info() and nulls are shown
    :param describe: boolean; if True, descriptions of the columns (grouped by
        numeric, datetime, boolean, and other) are shown
    :param duplicated: boolean; if True, info on duplicated rows is shown
    :param dup_kwd: dict of keyword arguments forwarded to find_duplicate;
        None (the default) forwards no extra arguments
    :return: None
    :raises AssertionError: if dataframe is not a pandas DataFrame
    """
    # Kept as an assert so the exception type seen by existing callers does
    # not change.
    assert isinstance(dataframe, pd.DataFrame), \
        "pandas DataFrame is required; got {} instead".format(type(dataframe))

    if head:
        print('Head of the dataframe:\n\n{}\n\n'.format(dataframe.head()))

    # shape, index, columns, nulls, dtypes
    if info:
        dataframe.info()
        print('\n')
        show_null(dataframe)
        print('\n')

    if describe:
        describe_by_type(dataframe)

    # find duplicates
    if duplicated:
        # A None default avoids sharing one dict object across calls.
        find_duplicate(dataframe, **(dup_kwd or {}))
def describe_by_type(dataframe):
    """
    prints descriptions of the columns (grouped by numeric, datetime, boolean,
    and others) and DatetimeIndex (if any)

    :param dataframe: a pandas DataFrame
    :return: None
    """
    has_boolean = has_numeric = has_datetime = has_other = False

    # Classify every column into exactly one group. Booleans are tested first
    # so that they are never counted as numeric.
    for column in dataframe.columns:
        series = dataframe[column]
        if is_bool_dtype(series):
            has_boolean = True
        elif is_numeric_dtype(series):
            has_numeric = True
        elif is_datetime64_dtype(series):
            has_datetime = True
        else:
            has_other = True

    # describe datetime columns and DatetimeIndex (if any)
    if isinstance(dataframe.index, DatetimeIndex):
        print(pd.Series(dataframe.index).describe())
        print('\n')
    if has_datetime:
        print(dataframe.describe(include=['datetime']))
        print('\n')

    # describe numeric columns (if any)
    if has_numeric:
        # Select numeric columns explicitly rather than relying on the default
        # column selection of describe(), which differs between pandas
        # versions.
        print(dataframe.describe(include=[np.number]))
        print('\n')

    # describe boolean columns (if any)
    if has_boolean:
        # The builtin bool is used because the np.bool alias is not available
        # in every NumPy release.
        print(dataframe.describe(include=[bool]))
        print('\n')

    # describe other columns (if any)
    if has_other:
        print(dataframe.describe(exclude=[np.number, np.datetime64, bool]))
        print('\n')
def show_null(dataframe):
    """
    prints the number and percentage of null values in each column

    :param dataframe: a pandas DataFrame
    :return: None
    """
    # Count the nulls once and reuse the result for every report below.
    null_counts = dataframe.isnull().sum()

    if null_counts.sum() == 0:
        print('No null in the dataframe.')
        return

    print('Number of nulls in each column:\n{}\n'.format(null_counts))
    # Scale to 0-100 so the values match the "Percentage" label; the division
    # is only reached when at least one null exists, so len(dataframe) > 0.
    print('Percentage of nulls in each column:\n{}\n'.format(
        null_counts / len(dataframe) * 100
    ))
def find_duplicate(dataframe, show=True, sort=False): | |
""" | |
prints out information on duplicate rows | |
:param dataframe: a pandas DataFrame | |
:param show: boolean; if True, the duplicated rows (if any) are shown | |
:param sort: boolean; if True, the duplicated rows are sorted by each column | |
of the dataframe |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment