Skip to content

Instantly share code, notes, and snippets.

@ksv-muralidhar
Created February 16, 2021 03:47
Show Gist options
  • Save ksv-muralidhar/ddc07a2c884912395dfe2cfce2a6c2ec to your computer and use it in GitHub Desktop.
Save ksv-muralidhar/ddc07a2c884912395dfe2cfce2a6c2ec to your computer and use it in GitHub Desktop.
EDA
from sklearn.datasets import load_diabetes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the diabetes dataset a single time and reuse the returned object for
# the feature matrix, the feature names and the target.
diabetes = load_diabetes()
# Store the input features in a data frame, one column per feature name.
data = pd.DataFrame(diabetes["data"], columns=diabetes["feature_names"])
# Add the target variable to the data frame as the "target" column.
data["target"] = diabetes["target"]
# Encode "sex" as an int8 indicator (positive -> 1, otherwise 0) before the
# dtype-based split below. At this point in the file no column has been cast
# to int8 yet, so without this step `categorical_features` would be empty and
# "sex" would be treated as a numeric feature. Re-running the same encoding
# later is harmless because it maps 1 -> 1 and 0 -> 0.
data["sex"] = (data["sex"] > 0).astype(np.int8)
target = "target"
# Numeric inputs: the float64 columns, excluding the target itself.
numeric_features = [
    column for column in data.select_dtypes("float64").columns if column != target
]
# Categorical inputs: the columns stored as int8.
categorical_features = list(data.select_dtypes("int8").columns)
print(f'numeric_features:\n{numeric_features}\n\ncategorical_features:\n{categorical_features}\n\ntarget:\n{target}')
# Percentage of missing values per column, rounded to one decimal place.
missing_share = data.isnull().mean()
np.round(missing_share * 100, 1)
# For each categorical feature, print every category's share of the counted
# rows as a percentage rounded to two decimal places.
for feature in categorical_features:
    counts = data[feature].value_counts()
    share_pct = np.round((counts / counts.sum()) * 100, 2)
    print(f'{feature}\n{share_pct}')
# Draw one kernel-density plot per numeric feature on a three-column grid.
# The row count is derived from the number of features, so the grid cannot
# overflow (IndexError) when there are more than nine of them; with exactly
# nine features this is the same 3x3, 10x10-inch layout.
n_cols = 3
n_rows = max(1, -(-len(numeric_features) // n_cols))  # ceiling division
fig, ax = plt.subplots(
    n_rows, n_cols, figsize=(10, 10 * n_rows / 3), squeeze=False
)
# ax.flat walks the grid row by row, matching the left-to-right,
# top-to-bottom placement of the features.
for axis, feature in zip(ax.flat, numeric_features):
    data[feature].plot(kind="kde", ax=axis)
    axis.set_title(feature)
# Hide grid cells that did not receive a plot.
for axis in ax.flat[len(numeric_features):]:
    axis.set_visible(False)
from scipy.stats import normaltest
# Test every numeric feature for normality with scipy's normaltest and print
# the verdict at the 5% significance level followed by the full test result.
# The test is evaluated once per feature and the result is reused.
for feature in numeric_features:
    result = normaltest(data[feature].values)
    # result[1] is the p-value of the test.
    verdict = "Not Gaussian" if result[1] < 0.05 else "Gaussian"
    print(f'{feature}: {verdict} {result}')
# Print, per numeric feature, the absolute ratio of standard deviation to
# median expressed as a percentage (rounded to two decimals).
for feature in numeric_features:
    column = data[feature]
    ratio_pct = np.round((column.std() / column.median()) * 100, 2)
    print(f'{feature}: {np.abs(ratio_pct)}')
# Show the target's distribution three ways, side by side: histogram,
# kernel-density estimate and box plot.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
for axis, plot_kind in zip(ax, ("hist", "kde", "box")):
    data[target].plot(kind=plot_kind, ax=axis)
plt.show()
# Normality test for the target variable: print the verdict at the 5%
# significance level followed by the full test result. The test is evaluated
# once and the result is reused.
target_result = normaltest(data[target].values)
# target_result[1] is the p-value of the test.
target_verdict = "Not Gaussian" if target_result[1] < 0.05 else "Gaussian"
print(f'{target}: {target_verdict} {target_result}')
# Box plot of the target for every categorical feature. Iterating instead of
# indexing element 0 avoids an IndexError when no categorical feature was
# detected and also covers any additional ones. Each plot gets a fresh
# figure so it cannot land on the axes of an earlier figure.
for feature in categorical_features:
    plt.figure()
    sns.boxplot(x=data[feature], y=data[target])
# Scatter plot with a fitted regression line of the target against each
# numeric feature, laid out on a three-column grid. The row count is derived
# from the number of features, so the grid cannot overflow (IndexError) when
# there are more than nine of them; with exactly nine features this is the
# same 3x3, 15x10-inch layout.
n_cols = 3
n_rows = max(1, -(-len(numeric_features) // n_cols))  # ceiling division
fig, ax = plt.subplots(
    n_rows, n_cols, figsize=(15, 10 * n_rows / 3), squeeze=False
)
# ax.flat walks the grid row by row, matching the left-to-right,
# top-to-bottom placement of the features.
for axis, feature in zip(ax.flat, numeric_features):
    sns.regplot(x=feature, y="target", data=data, ax=axis, ci=False)
# Hide grid cells that did not receive a plot.
for axis in ax.flat[len(numeric_features):]:
    axis.set_visible(False)
# Pearson correlation heatmap of the numeric features together with the
# target. The correlation matrix is computed once and reused.
num_tgt = [*numeric_features, 'target']
pearson_corr = data[num_tgt].corr(method='pearson')
# Mask the upper triangle and the diagonal by position, so only the lower
# triangle is drawn. A positional boolean mask does not depend on the
# coefficient values; a value-based np.triu mask would leave a coefficient
# of exactly 0 unmasked.
upper_mask = np.triu(np.ones_like(pearson_corr, dtype=bool))
fig = plt.figure(figsize=(8, 8))
sns.heatmap(pearson_corr, annot=True, fmt='.2f', mask=upper_mask, cbar=False)
# Dimensions of the data frame as (rows, columns). As a bare expression this
# is only displayed in an interactive/notebook session.
data.shape
# Kendall rank-correlation heatmap of the columns listed in `num_tgt`.
kendall_corr = data[num_tgt].corr(method='kendall')
# Mask the upper triangle and the diagonal by position, derived from the
# Kendall matrix itself rather than from a separately computed Pearson
# matrix, so the mask never depends on coefficient values.
kendall_mask = np.triu(np.ones_like(kendall_corr, dtype=bool))
fig = plt.figure(figsize=(8, 8))
sns.heatmap(kendall_corr, annot=True, fmt='.2f', mask=kendall_mask, cbar=False)
# Quick look at the frame: first five rows, column names, column dtypes and
# the distinct values stored in "sex". These bare expressions are only
# displayed in an interactive/notebook session.
data.head(n=5)
data.columns.tolist()
data.dtypes
data["sex"].unique()
# Recode "sex" into a binary indicator: positive values become 1 and
# negative values become 0.
is_positive = data["sex"] > 0
is_negative = data["sex"] < 0
data.loc[is_positive, "sex"] = 1
data.loc[is_negative, "sex"] = 0
# Inspect the recoded frame and its summary statistics (one row per column).
data.head(n=5)
data.describe().transpose()
# Store the indicator as a compact 8-bit integer and re-check the dtypes.
data["sex"] = data["sex"].astype("int8")
data.dtypes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment