Skip to content

Instantly share code, notes, and snippets.

@humamfauzi
Last active September 6, 2018 12:39
Show Gist options
  • Save humamfauzi/02bdc18f9a5a89fda29aebc82e07b035 to your computer and use it in GitHub Desktop.
Save humamfauzi/02bdc18f9a5a89fda29aebc82e07b035 to your computer and use it in GitHub Desktop.
My personal data science tool
def describe_dataframe(dataframe, max_unique=100):
    """Summarise the low-cardinality columns of a dataframe, grouped by dtype.

    For every dtype present in ``dataframe``, each column of that dtype whose
    number of distinct non-null values is below ``max_unique`` is described
    by its value counts and by the fraction of rows that are null.

    Parameters:
        dataframe: the pandas DataFrame to inspect.
        max_unique: a column is described only when ``nunique()`` is strictly
            smaller than this threshold (default 100).

    Returns:
        A dict keyed by ``str(dtype)``. Each value is a dict keyed by
        ``str(column)`` holding ``"total_category"`` (the column's
        ``value_counts()`` as a dict) and ``"NaN_ratio"`` (null count divided
        by the number of rows).

    Side effects:
        Prints one summary line per dtype with the number of described
        columns out of the number of columns of that dtype.
    """
    describe_df = {}
    # Row count is loop-invariant; keep it as a float so the ratio is a true
    # division regardless of interpreter version.
    n_rows = float(dataframe.shape[0])
    for dtype in dataframe.dtypes.value_counts().index:
        cols = dataframe.select_dtypes(str(dtype)).columns
        describe_dict = {}
        for col in cols:
            series = dataframe[col]
            if series.nunique() < max_unique:
                describe_dict[str(col)] = {
                    "total_category": dict(series.value_counts()),
                    "NaN_ratio": series.isnull().sum() / n_rows,
                }
        describe_df[str(dtype)] = describe_dict
        print('Total', str(dtype), 'Classification:', len(describe_dict), "from", len(cols))
    return describe_df
#------------------------------
def boundaries_check(df_train, df_test, columns):
    """Drop training rows whose values fall outside the range seen in test.

    For each column in ``columns`` (processed in order), rows of the training
    frame whose value is greater than the test maximum (upper bound, "UB") or
    smaller than the test minimum (lower bound, "LB") are removed. Rows
    removed for an earlier column are not counted again for later columns.

    Parameters:
        df_train: training DataFrame; it is not modified in place.
        df_test: test DataFrame providing the per-column min/max bounds.
        columns: iterable of column names present in both frames.

    Returns:
        A DataFrame containing the surviving training rows.

    Side effects:
        Prints a line per column that had out-of-range rows, then the totals
        and the share of the original training rows that were removed.
    """
    # Remember the starting size: df_train is rebound as rows are removed,
    # and the ratio must be relative to the frame the caller passed in.
    original_rows = len(df_train)
    lb_count = 0
    ub_count = 0
    for col in columns:
        # Bounds depend only on the test frame, so compute them once.
        test_max = df_test[col].max()
        test_min = df_test[col].min()
        above = df_train[col] > test_max
        below = df_train[col] < test_min
        outside_ub = int(above.sum())
        outside_lb = int(below.sum())
        if outside_lb != 0 or outside_ub != 0:
            ub_count = ub_count + outside_ub
            lb_count = lb_count + outside_lb
            print(col, "::", outside_lb, "<>", outside_ub)
            # Filter by mask rather than by index label so that duplicate
            # index labels cannot take in-range rows with them.
            df_train = df_train[~(above | below)]
    total = ub_count + lb_count
    print("Total", total, "UB:", ub_count, "LB:", lb_count)
    ratio = total / float(original_rows) if original_rows else 0.0
    print("Affected ratios: ", ratio)
    return df_train
#--------------------------------------------
def disc2disc(df_train, df_test, column):
    """Plot a heatmap comparing category frequencies of a column in train vs test.

    Only values of ``column`` that occur in both frames are compared. For each
    such value the share of rows holding it is computed separately for the
    training and the test frame, and the two shares are drawn side by side in
    an annotated seaborn heatmap on a new matplotlib figure.

    Parameters:
        df_train: training DataFrame.
        df_test: test DataFrame.
        column: name of the column to compare; also used as the plot title.

    Returns:
        None. The result is the figure drawn through ``plt`` / ``sns``.
    """
    train = dict(df_train[column].value_counts())
    test = dict(df_test[column].value_counts())
    # Work from the counted values of the frames that were passed in.
    # value_counts() skips NaN, so intersecting the two count dicts never
    # looks up a key that is missing from either side.
    shared = [key for key in test if key in train]
    n_train = float(len(df_train))
    n_test = float(len(df_test))
    ratios = {"train": {}, "test": {}}
    for key in shared:
        ratios["train"][str(key)] = train[key] / n_train
        ratios["test"][str(key)] = test[key] / n_test
    # Figure height grows with the number of compared categories.
    plt.figure(figsize=(8, (len(shared) + 10) // 4))
    plt.title(column)
    sns.heatmap(pd.DataFrame(ratios), annot=True, fmt="f", linewidths=0.5, cmap="YlGnBu")
#--------------------------------------------
def cont2disc(dataframe, column, target, legend, transformation="vanilla"):
    """Overlay the distribution of a column for several row selections.

    Each element of ``target`` is used to index ``dataframe[column]``; the
    resulting subsets are drawn with ``sns.distplot`` on one shared axis, and
    the skew and kurtosis of every subset are written onto the figure.

    Parameters:
        dataframe: DataFrame holding the data.
        column: name of the column to plot.
        target: iterable of selectors, each applied as
            ``dataframe[column][selector]``.
            NOTE(review): presumably boolean masks, one per class -- confirm
            against callers.
        legend: labels passed to ``ax.legend``.
        transformation: the string ``"vanilla"`` to plot raw values, or a
            callable applied to each subset before plotting and before
            computing the statistics.

    Returns:
        None. The result is the figure drawn through ``plt`` / ``sns``.
    """
    fig, ax = plt.subplots(figsize=(18, 8))
    # Name the plotted column in the title; the bare " Distribution" title
    # did not say which column the figure shows.
    ax.set_title(str(column) + " Distribution")
    use_raw = transformation == "vanilla"
    subsets = []
    for selector in target:
        subset = dataframe[column][selector]
        if not use_raw:
            subset = transformation(subset)
        # Keep the (possibly transformed) subset so the statistics below are
        # computed on exactly what was plotted, without re-transforming.
        subsets.append(subset)
        sns.distplot(subset)
    shown_text = " ".join(["Skew: " + str(subset.skew())[:4] +
                           " Kurtosis " + str(subset.kurt())[:4] + "\n"
                           for subset in subsets])
    ax.text(0.15, 0.8, shown_text, transform=fig.transFigure)
    ax.legend(legend)
    ax.grid()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment