Last active
September 6, 2018 12:39
-
-
Save humamfauzi/02bdc18f9a5a89fda29aebc82e07b035 to your computer and use it in GitHub Desktop.
My personal data science tool
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def describe_dataframe(dataframe, max_unique=100):
    """Summarise the low-cardinality columns of a dataframe, grouped by dtype.

    For every dtype present in ``dataframe``, each column of that dtype with
    fewer than ``max_unique`` distinct non-null values is described by its
    value counts and its share of missing values. One summary line per dtype
    is printed as a side effect.

    Args:
        dataframe: pandas DataFrame to describe.
        max_unique: exclusive upper bound on ``nunique()`` for a column to be
            reported. Defaults to 100, the previously hard-coded threshold.

    Returns:
        dict mapping ``str(dtype)`` to a dict mapping ``str(column)`` to
        ``{"total_category": {value: count}, "NaN_ratio": fraction}``.
    """
    describe_df = {}
    for dtype in dataframe.dtypes.value_counts().index:
        cols = dataframe.select_dtypes(str(dtype)).columns
        describe_dict = {}
        for col in cols:
            series = dataframe[col]
            if series.nunique() < max_unique:
                describe_dict[str(col)] = {
                    "total_category": dict(series.value_counts()),
                    # Mean of the null mask equals null_count / row_count and
                    # avoids an explicit 0/0 division on an empty frame.
                    "NaN_ratio": series.isnull().mean(),
                }
        describe_df[str(dtype)] = describe_dict
        print('Total', str(dtype), 'Classification:', len(describe_dict), "from", len(cols))
    return describe_df
#------------------------------ | |
def boundaries_check(df_train, df_test, columns):
    """Drop train rows whose values fall outside the range seen in test.

    For each column in ``columns`` (processed in order), rows of ``df_train``
    that are strictly greater than the maximum of ``df_test[column]`` (upper
    bound, "UB") or strictly less than its minimum (lower bound, "LB") are
    removed. Rows removed for an earlier column are not counted again for a
    later one. Per-column and total counts are printed as a side effect.

    Args:
        df_train: pandas DataFrame to filter; the caller's object is not
            modified.
        df_test: pandas DataFrame providing the per-column min/max bounds.
        columns: iterable of column names present in both frames.

    Returns:
        The filtered training DataFrame.
    """
    # Remember the starting size: the affected ratio is relative to the frame
    # that was passed in, not to what is left after filtering.
    initial_rows = len(df_train)
    lb_count = 0
    ub_count = 0
    for col in columns:
        above = df_train[col] > df_test[col].max()
        below = df_train[col] < df_test[col].min()
        outside_ub = int(above.sum())
        outside_lb = int(below.sum())
        if outside_lb or outside_ub:
            ub_count += outside_ub
            lb_count += outside_lb
            print(col, "::", outside_lb, "<>", outside_ub)
            # Filter positionally with the boolean mask so that only the
            # offending rows go, even when index labels are duplicated.
            keep = ~(above | below)
            df_train = df_train[keep.values]
    total = ub_count + lb_count
    print("Total", total, "UB:", ub_count, "LB:", lb_count)
    affected = total / float(initial_rows) if initial_rows else 0.0
    print("Affected ratios: ", affected)
    return df_train
#-------------------------------------------- | |
def disc2disc(df_train, df_test, column):
    """Plot a heatmap comparing category frequencies of a column in train vs test.

    Only values of ``column`` that occur in both frames are compared. For each
    such value the share of rows holding it is computed per frame (count
    divided by the frame's total row count) and the two shares are drawn side
    by side with ``sns.heatmap`` on a new matplotlib figure.

    Args:
        df_train: pandas DataFrame with the training data.
        df_test: pandas DataFrame with the test data.
        column: name of the discrete column to compare.

    Returns:
        None. The result is the figure drawn as a side effect.
    """
    # value_counts() leaves out NaN, so missing values never become a key.
    train_counts = dict(df_train[column].value_counts())
    test_counts = dict(df_test[column].value_counts())

    # Keep the values present in both frames, in test's value_counts order.
    shared = [value for value in test_counts if value in train_counts]

    train_rows = float(len(df_train))
    test_rows = float(len(df_test))
    ratios = {
        "train": {str(value): train_counts[value] / train_rows for value in shared},
        "test": {str(value): test_counts[value] / test_rows for value in shared},
    }

    # Figure height grows with the number of compared values.
    plt.figure(figsize=(8, (len(shared) + 10) // 4))
    plt.title(column)
    sns.heatmap(pd.DataFrame(ratios), annot=True, fmt="f", linewidths=0.5, cmap="YlGnBu")
#-------------------------------------------- | |
def cont2disc(dataframe, column, target, legend, transformation="vanilla"):
    """Plot the distribution of a column for several row selections on one axes.

    Each element of ``target`` is used to index ``dataframe[column]``; the
    resulting values are optionally transformed and drawn with
    ``sns.distplot``. The skew and kurtosis of every selection are written
    onto the figure, one line per selection.

    Args:
        dataframe: pandas DataFrame holding the data.
        column: name of the column to plot.
        target: iterable of selectors applied as ``dataframe[column][selector]``
            (presumably boolean masks, one per class; confirm against callers).
        legend: labels passed to ``ax.legend``.
        transformation: ``"vanilla"`` to plot raw values, otherwise a callable
            applied to each selection; its result must provide ``.skew()``
            and ``.kurt()``.

    Returns:
        None. The result is the figure drawn as a side effect.
    """
    if isinstance(transformation, str) and transformation == "vanilla":
        def transform(values):
            return values
    else:
        transform = transformation

    # Select and transform once per group; the plot and the statistics below
    # both reuse these values.
    series = dataframe[column]
    groups = [transform(series[selector]) for selector in target]

    fig, ax = plt.subplots(figsize=(18, 8))
    # The title previously read " Distribution" with no subject.
    ax.set_title(str(column) + " Distribution")

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the rendered output does not change.
    for values in groups:
        sns.distplot(values, ax=ax)

    shown_text = " ".join(
        "Skew: " + str(values.skew())[:4] +
        " Kurtosis " + str(values.kurt())[:4] + "\n"
        for values in groups
    )
    ax.text(0.15, 0.8, shown_text, transform=fig.transFigure)
    ax.legend(legend)
    ax.grid()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment