Skip to content

Instantly share code, notes, and snippets.

@dthboyd
Forked from jiahao87/exploratory_data_analysis.py
Created September 27, 2019 12:44
Show Gist options
  • Save dthboyd/0f8e64a449b3788f41465c5b926d0e06 to your computer and use it in GitHub Desktop.
Save dthboyd/0f8e64a449b3788f41465c5b926d0e06 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
def time_series_plot(df):
    """Plot each numeric column as daily, monthly and yearly totals.

    For every ``datetime64`` column in ``df``, the numeric columns are indexed
    by that datetime column, resampled with ``sum`` at day, month-end and
    year-end frequency, and drawn as one line chart per numeric column.
    Nothing is plotted when ``df`` has no ``datetime64`` column.

    Args:
        df: pandas DataFrame to plot.

    Returns:
        None. Progress messages are printed and charts are shown with
        ``plt.show()``.
    """
    print("\nTo check time series of numeric data by daily, monthly and yearly frequency")

    datetime_columns = df.select_dtypes(include='datetime64').columns
    numeric_columns = list(df.select_dtypes(include=np.number).columns)

    # Offset objects are used instead of the 'D'/'M'/'Y' strings: they select
    # the same day / month-end / year-end bins, but do not depend on string
    # aliases that newer pandas releases deprecate.
    frequencies = (
        ("Plotting daily data", pd.offsets.Day()),
        ("Plotting monthly data", pd.offsets.MonthEnd()),
        ("Plotting yearly data", pd.offsets.YearEnd()),
    )

    def _with_thousands_separator(value, _position):
        # Tick label formatter: 1234567 -> '1,234,567'.
        return format(int(value), ',')

    for time_col in datetime_columns:
        # Keep only the numeric columns next to the index column, so sum() is
        # never applied to text, categorical or other datetime columns.
        indexed = df[[time_col] + numeric_columns].set_index(time_col)
        for message, offset in frequencies:
            print(message)
            # Resample once per frequency and reuse it for every numeric column.
            totals = indexed.resample(offset).sum()
            for col_num in numeric_columns:
                ax = totals[[col_num]].plot()
                ax.set_ylim(bottom=0)
                ax.get_yaxis().set_major_formatter(
                    matplotlib.ticker.FuncFormatter(_with_thousands_separator))
                plt.show()
def numeric_eda(df, hue=None):
    """Show summary statistics and distribution plots for numeric columns.

    Displays ``df.describe()`` transposed, then draws:
      * one box plot per numeric column, side by side in a single figure;
      * one violin plot per (numeric column, ``category`` column) pair;
      * a pairwise joint-distribution grid of the numeric columns.

    Args:
        df: pandas DataFrame to analyse.
        hue: optional column name used to colour the pair plot.

    Returns:
        None. Output is printed/displayed and charts are shown with
        ``plt.show()``.

    Note:
        ``display`` is not defined in this file; presumably it is the
        IPython/Jupyter builtin, so this is expected to run in a notebook.
    """
    print("\nTo check: \nDistribution of numeric data")
    display(df.describe().T)

    numeric_frame = df.select_dtypes(include=np.number)
    numeric_columns = list(numeric_frame.columns)
    if not numeric_columns:
        # A subplot grid with zero columns and a pair plot of nothing both
        # fail, so there is nothing further to draw.
        return

    # Box plots: one subplot per numeric column in a single row.
    figure = plt.figure(figsize=(20, 10))
    for position, col in enumerate(numeric_columns, start=1):
        ax = figure.add_subplot(1, len(numeric_columns), position)
        sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'}, ax=ax)
    figure.tight_layout()
    plt.show()

    # Violin plots of each numeric column split by each categorical column.
    category_columns = df.select_dtypes(include='category').columns
    for col_num in numeric_columns:
        for col in category_columns:
            grid = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
            grid.set_xticklabels(rotation=90)
            plt.show()

    # Plot the pairwise joint distributions
    print("\nTo check pairwise joint distribution of numeric data")
    if hue is None:
        sns.pairplot(numeric_frame)
    elif hue in numeric_columns:
        # hue is already part of the numeric frame; joining it again would
        # fail on the overlapping column name.
        sns.pairplot(numeric_frame, hue=hue)
    else:
        sns.pairplot(numeric_frame.join(df[[hue]]), hue=hue)
    plt.show()
def top5(df):
    """Print the five most frequent values of every non-numeric column.

    For each ``object`` or ``category`` column a small table is printed with
    the value in a column named after the source column and its frequency in
    a column named ``Count``, most frequent first. Columns with fewer than
    five distinct values print all of them.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. The tables are written to standard output.
    """
    columns = df.select_dtypes(include=['object', 'category']).columns
    for col in columns:
        # f-string so that non-string column labels do not break the message.
        print(f"Top 5 unique values of {col}")
        # rename_axis + reset_index(name=...) labels the two output columns
        # explicitly instead of relying on the names value_counts() happens
        # to produce, which differ between pandas versions.
        top_values = (
            df[col]
            .value_counts()
            .head(5)
            .rename_axis(col)
            .reset_index(name="Count")
        )
        print(top_values)
        print(" ")
def categorical_eda(df, hue=None):
    """Summarise and plot the non-numeric columns of a dataframe.

    Prints the number of distinct values of every ``object``/``category``
    column, prints their most frequent values via ``top5``, and then shows a
    count plot for each ``category`` column.

    Args:
        df: pandas DataFrame to analyse.
        hue: optional column name forwarded to ``sns.catplot`` as ``hue``.

    Returns:
        None.
    """
    print("\nTo check: \nUnique count of non-numeric data\n")
    non_numeric = df.select_dtypes(include=['object', 'category'])
    print(non_numeric.nunique())
    top5(df)

    # Count plots are drawn only for columns typed as 'category'.
    category_names = df.select_dtypes(include='category').columns
    for name in category_names:
        grid = sns.catplot(x=name, kind="count", data=df, hue=hue)
        grid.set_xticklabels(rotation=90)
        plt.show()
def eda(df):
    """Run the full exploratory data analysis on a dataframe.

    Steps, in order: preview the first rows, print ``df.info()``, preview
    rows containing nulls together with a ``missingno`` matrix, report
    duplicated rows, then run ``categorical_eda``, ``numeric_eda`` and
    ``time_series_plot``. Fields that are empty or whitespace-only are
    treated as missing; the caller's dataframe is not modified.

    Args:
        df: pandas DataFrame (or subclass) to analyse.

    Returns:
        None.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
    """
    # check that input is pandas dataframe (subclasses are accepted)
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Only pandas dataframe is allowed as input")

    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    display(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit a stray "None" line).
    df.info()

    # generate preview of entries with null values
    rows_with_nulls = df[df.isnull().any(axis=1)]
    if len(rows_with_nulls) > 0:
        print("\nPreview of data with null values:")
        display(rows_with_nulls.head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count > 0:
        print("\n***Number of duplicated entries: ", duplicate_count)
        # keep=False shows every member of each duplicate group; sorting by
        # all columns places identical rows next to each other.
        display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    # EDA of categorical data
    categorical_eda(df)

    # EDA of numeric data
    numeric_eda(df)

    # Plot time series plot of numeric data
    time_series_plot(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment