Skip to content

Instantly share code, notes, and snippets.

# Build the predictor matrices (x*) and response vectors (y*) for both
# versions of the data. Predictors are exactly the columns listed in the
# "variables" column of the corresponding VIF table.

# un-transformed dataset
xi = coded_df[list(coded_df_vif_iter2.variables)]
yi = coded_df["WORK_LIFE_BALANCE_SCORE"]

# transformed dataset
xt = coded_df_T[list(coded_df_T_vif_iter2.variables)]
yt = coded_df_T["WORK_LIFE_BALANCE_SCORE"]
#coded_df_T_vif_iter2.sort_values(by='VIF', ascending = False).head(12)
# Bar chart of the VIF values from coded_df_vif_iter2, sorted from highest to
# lowest VIF, drawn on the left-hand axes of a two-panel figure.
from matplotlib import pyplot as plt
# Default figure size and automatic layout adjustment for every figure created below.
plt.rcParams["figure.figsize"] = [14, 7]
plt.rcParams["figure.autolayout"] = True
# One row, two columns of axes; only axes[0] is used in the visible code.
f, axes = plt.subplots(1, 2)
# UnTransformed Data
# NOTE(review): `sns` is not imported in the visible code - presumably seaborn,
# imported elsewhere; confirm.
VIF_plot = sns.barplot(x=coded_df_vif_iter2.sort_values(by='VIF', ascending = False)["variables"],
y=coded_df_vif_iter2.sort_values(by='VIF', ascending = False)["VIF"], ax=axes[0])
# NOTE(review): the body of this loop over the x tick labels is not visible in
# this excerpt; what it does to each label cannot be determined from here.
for item in VIF_plot.get_xticklabels():
# dropping high VIF variables one by one
# Columns excluded before recomputing VIF on the un-transformed data.
_excluded_raw = [
    "WORK_LIFE_BALANCE_SCORE",
    "SLEEP_HOURS",
    "TODO_COMPLETED",
    "SOCIAL_NETWORK",
    "FRUITS_VEGGIES",
    "SUPPORTING_OTHERS",
    "PERSONAL_AWARDS",
    "WEEKLY_MEDITATION",
]
# Columns excluded before recomputing VIF on the transformed data.
_excluded_transformed = [
    "WORK_LIFE_BALANCE_SCORE",
    "SLEEP_HOURS_T",
    "ACHIEVEMENT_T",
    "SOCIAL_NETWORK",
    "FRUITS_VEGGIES",
    "FLOW_T",
    "TODO_COMPLETED_T",
    "SUPPORTING_OTHERS",
    "WEEKLY_MEDITATION",
]

# calculate VIF
coded_df_vif_iter2 = calc_vif(coded_df.drop(columns=_excluded_raw))
coded_df_T_vif_iter2 = calc_vif(coded_df_T.drop(columns=_excluded_transformed))

# Show the twelve largest VIF values of the un-transformed data.
coded_df_vif_iter2.sort_values(by="VIF", ascending=False).head(12)
# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
    """Compute the variance inflation factor of every column of ``X``.

    Args:
        X: DataFrame of predictor columns. Its ``.values`` array is passed to
            ``variance_inflation_factor`` once per column index.

    Returns:
        A DataFrame with one row per column of ``X`` and two columns:
        ``"variables"`` (the column labels of ``X``) and ``"VIF"`` (the value
        returned by ``variance_inflation_factor`` for that column).
    """
    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    # Materialise the array once instead of once per column.
    values = X.values
    vif["VIF"] = [variance_inflation_factor(values, i) for i in range(X.shape[1])]
    # Callers chain .sort_values(...) / .variables on the result, so the table
    # must be returned rather than discarded.
    return vif
# transformed dataset and non transformed dataset

# non transformed dataset
coded_df.head()

# transformed dataset: drop the first eight entries of xval from coded_df and
# append the columns of transformed_skew_df in their place.
_replaced_columns = list(xval[0:8])
coded_df_T = pd.concat(
    [coded_df.drop(columns=_replaced_columns), transformed_skew_df],
    axis=1,
)
coded_df_T.head()
# Skewness of every int64 / float64 column of coded_df, keyed by column name.
# Columns of any other dtype are left out.
skewness_dict = {
    column: coded_df[column].skew()
    for column in coded_df.columns
    if coded_df[column].dtypes in ('int64', 'float64')
}

# sort based on highest absolute skew
skewness = sorted(
    skewness_dict.items(),
    key=lambda name_and_skew: abs(name_and_skew[1]),
    reverse=True,
)
# Assemble coded_df: df without the four listed columns, followed column-wise
# by the frames stored in `transformed` under each key of `list_transformed`.
_kept_columns = df.drop(columns=["GENDER", "AGE", "BMI_RANGE", "SUFFICIENT_INCOME"])
_replacement_columns = pd.concat(
    [transformed[key] for key in list_transformed],
    axis=1,
)
coded_df = pd.concat([_kept_columns, _replacement_columns], axis=1)
coded_df.head()
# dummy variables
# Recode the numeric / free-text category codes into descriptive labels.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form operates on a temporary Series under pandas Copy-on-Write and would
# leave `df` unchanged.
df['BMI_RANGE'] = df['BMI_RANGE'].replace({1: "BMI < 25", 2: "BMI >= 25"})
df['SUFFICIENT_INCOME'] = df['SUFFICIENT_INCOME'].replace(
    {1: "INCOME INSUFFICIENT", 2: "INCOME SUFFICIENT"}
)
# The numeric prefixes make the age groups sort in ascending order.
df['AGE'] = df['AGE'].replace(
    {
        'Less than 20': '0. Less than 20',
        '21 to 35': '1. 21 to 35',
        '36 to 50': '2. 36 to 50',
        '51 or more': '3. 51 or more',
    }
)
# Gender x Age contingency table, normalised over all values and including
# the margin totals, rendered as percentages with two decimals.
pd.crosstab(
    index=df["GENDER"],
    columns=df["AGE"],
    rownames=["Gender"],
    colnames=["Age"],
    margins=True,
    normalize=True,
).style.format("{:.2%}")
# Month x Year contingency table, normalised over all values and including
# the margin totals, rendered as percentages with two decimals.
pd.crosstab(
    index=df["Month"],
    columns=df["Year"],
    rownames=["Month"],
    colnames=["Year"],
    margins=True,
    normalize=True,
).style.format("{:.2%}")