Satsawat Natakarnkitkul (Net) netsatsawat

## quick_look.py
import pandas as pd

# Load the raw HR attrition data set.
data_df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Quick overview: dimensions, summary statistics and missing values per column.
# NOTE(review): `display` is not defined in this snippet -- presumably the
# IPython/Jupyter helper; confirm the snippet only runs inside a notebook.
print(data_df.shape)
display(data_df.describe())
print(data_df.isnull().sum())

# Map every column that holds exactly one distinct value to a list
# containing that value.
one_uniq = {}
for col in data_df.columns:
    distinct_values = data_df[col].unique()
    if len(distinct_values) == 1:
        one_uniq[col] = distinct_values.tolist()

## plot_categorical.py
def plot_categorical(df: pd.DataFrame , col:str):
    """
    Plot a categorical column of a data frame as a pie chart using Plotly.

    NOTE(review): only the signature and docstring are visible in this
    excerpt; the description is carried over from the original docstring and
    should be confirmed against the full implementation.

    @Args:
      df: pandas data frame
      col: name of the column within ``df`` to plot

    Return:
      No object is returned; the function only produces a visualization
    """

## Work_life_viz.py
# Explicit category order used on the x-axis of both panels.
_tmp_order = ['Bad', 'Good', 'Better', 'Best']

# 2x2 grid of panels; only the two top-row panels are filled in here.
f, axes = plt.subplots(2, 2, figsize=(14, 14))

# Top-left: head-count per work-life-balance category, split by attrition.
sns.countplot(
    data=data_df,
    x='WorkLifeBalance',
    hue='Attrition',
    order=_tmp_order,
    palette={'Yes': 'r', 'No': 'skyblue'},
    ax=axes[0, 0],
)
axes[0, 0].set_title('Overall Work Life Balance')

# Top-right: distance from home per category, again split by attrition.
sns.boxplot(
    data=data_df,
    x='WorkLifeBalance',
    y='DistanceFromHome',
    hue='Attrition',
    order=_tmp_order,
    palette={'Yes': 'r', 'No': 'skyblue'},
    ax=axes[0, 1],
)
axes[0, 1].set_title('Compare with distance from home')

## hr_feature_engineering.py
def create_generation_feature(age_val: int) -> str:
    """
    Convert an age value into a generation label.

    NOTE(review): this excerpt ends right after ``out`` is initialised; the
    mapping from age ranges to generation names is not visible here and
    must be checked against the full implementation.

    @Args:
      age_val (int): the age value from data frame

    Return:
      String output specifying the generation
    """
    # Start from an empty label.
    out = ''

## check_target_label.py
# Bar chart of the training-label counts, each bar annotated with its share
# of the training set.
plt.figure(figsize=(10, 6))
total_ = float(len(y_train))
# Pass the labels by keyword: since seaborn 0.12 the only positional argument
# of countplot is `data`, so a bare positional Series is no longer read as x.
ax = sns.countplot(x=y_train)
for p in ax.patches:
    height = p.get_height()
    # Centre the percentage label horizontally on the bar, 10 count units
    # above its top.
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 10,
            '{0:1.1%}'.format(height / total_),
            ha='center')
plt.title('Training label distribution', fontsize=16)

## prediction_evaluation.py
def prediction_evaluation (algorithm, X_train, X_test, y_train, y_test,
                           predictor_cols, cf = 'features'):
    """
     Function to predict and evaluate the provided algorithm by using Plotly library
       to visualize the confusion matrix, ROC curve as well as provided the feature importances.
     @Args:
       algorithm: the model algorithm object
       X_train: the predictor features of the training pandas data frame
       X_test: the predictor features of the testing pandas data frame
       y_train: the target variable of the training pandas data frame

## decision_tree.py
# Shallow decision tree (depth 3) with a fixed seed, evaluated through the
# project's prediction_evaluation helper in 'features' mode.
tree_clf = tree.DecisionTreeClassifier(random_state=SEED, max_depth=3)
_ = myUtilityFunction.prediction_evaluation(tree_clf, X_train, X_test,
                                            y_train, y_test, X_train.columns,
                                            'features')

# Export the tree as Graphviz DOT source (out_file=None returns a string).
# feature_names is given as a plain list: some scikit-learn releases validate
# this parameter as a list and reject a pandas Index.
dot_data = tree.export_graphviz(tree_clf, out_file=None,
                                feature_names=list(X_train.columns),
                                class_names=['No', 'Yes'],
                                filled=True, rounded=True,
                                special_characters=True)

## xgboost_classifier.py
# Baseline XGBoost classifier with fixed hyper-parameters, evaluated through
# the project's prediction_evaluation helper in "features" mode.
xgb_clf = xgboost.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=SEED,
)
_ = myUtilityFunction.prediction_evaluation(
    xgb_clf, X_train, X_test, y_train, y_test, X_train.columns, "features"
)

## hyperparameter_tuning_xgboost.py
from sklearn.model_selection import RandomizedSearchCV

xgb_clf = xgboost.XGBClassifier(random_state=SEED, n_jobs=-1)
params = {'n_estimators': [50, 100, 200, 300],
          'learning_rate': [0.01, 0.05, 0.1, 0.15],
          'min_child_weight': [1, 2, 3, 5, 10],
          'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 1],
          'subsample': [0.6, 0.7, 0.8],
          'colsample_bytree': [0.6, 0.7, 0.8],
          'max_depth': [3, 4, 5],

## logistic_regression.py
# Grid searched with 5-fold cross-validation, scored by F1.
cv_params = {'C': [0.001, 0.01, 0.1, 1., 10., 100.],
             'penalty': ['l1', 'l2'],
             'class_weight': [None, 'balanced']
            }
# The solver is pinned to liblinear because the grid includes the L1 penalty,
# which the default lbfgs solver of recent scikit-learn releases does not
# support (those candidates would otherwise fail to fit).
fix_params = {'random_state': SEED, 'solver': 'liblinear'}
log_cv_1 = GridSearchCV(LogisticRegression(**fix_params), cv_params, scoring='f1', cv=5)
log_cv_1.fit(X_train, y_train)

# Build a fresh model from the fixed settings plus the best grid point and
# evaluate it through the project's helper in "coefficients" mode.
log_clf_all = LogisticRegression(**{**fix_params, **log_cv_1.best_params_})
_ = myUtilityFunction.prediction_evaluation(log_clf_all, X_train, X_test, y_train, y_test,
                                            X_train.columns, "coefficients")
	import pandas as pd

	# Load the raw HR attrition data set.
	data_df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')

	# Quick overview: dimensions, summary statistics and missing values per column.
	# NOTE(review): `display` is not defined in this snippet -- presumably the
	# IPython/Jupyter helper; confirm the snippet only runs inside a notebook.
	print(data_df.shape)
	display(data_df.describe())
	print(data_df.isnull().sum())

	# Map every column that holds exactly one distinct value to a list
	# containing that value. The loop and condition bodies are re-indented;
	# the flattened original could not be parsed.
	one_uniq = dict()
	for col in data_df.columns:
	    distinct_values = data_df[col].unique()
	    if len(distinct_values) == 1:
	        one_uniq[col] = distinct_values.tolist()
	def plot_categorical(df: pd.DataFrame , col:str):
	    """
	    Plot a categorical column of a data frame as a pie chart using Plotly.

	    NOTE(review): only the signature and docstring are visible in this
	    excerpt; the description is carried over from the original docstring
	    and should be confirmed against the full implementation.

	    @Args:
	      df: pandas data frame
	      col: name of the column within ``df`` to plot

	    Return:
	      No object is returned; the function only produces a visualization
	    """
	# Explicit category order used on the x-axis of both panels.
	_tmp_order = ['Bad', 'Good', 'Better', 'Best']

	# 2x2 grid of panels; only the two top-row panels are filled in here.
	f, axes = plt.subplots(2, 2, figsize=(14, 14))

	# Top-left: head-count per work-life-balance category, split by attrition.
	sns.countplot(
	    data=data_df,
	    x='WorkLifeBalance',
	    hue='Attrition',
	    order=_tmp_order,
	    palette={'Yes': 'r', 'No': 'skyblue'},
	    ax=axes[0, 0],
	)
	axes[0, 0].set_title('Overall Work Life Balance')

	# Top-right: distance from home per category, again split by attrition.
	sns.boxplot(
	    data=data_df,
	    x='WorkLifeBalance',
	    y='DistanceFromHome',
	    hue='Attrition',
	    order=_tmp_order,
	    palette={'Yes': 'r', 'No': 'skyblue'},
	    ax=axes[0, 1],
	)
	axes[0, 1].set_title('Compare with distance from home')
	def create_generation_feature(age_val: int) -> str:
	    """
	    Convert an age value into a generation label.

	    NOTE(review): this excerpt ends right after ``out`` is initialised; the
	    mapping from age ranges to generation names is not visible here and
	    must be checked against the full implementation. Only the body
	    indentation was restored.

	    @Args:
	      age_val (int): the age value from data frame

	    Return:
	      String output specifying the generation
	    """
	    # Start from an empty label.
	    out = ''
	# Bar chart of the training-label counts, each bar annotated with its share
	# of the training set.
	plt.figure(figsize=(10, 6))
	total_ = float(len(y_train))
	# Pass the labels by keyword: since seaborn 0.12 the only positional argument
	# of countplot is `data`, so a bare positional Series is no longer read as x.
	ax = sns.countplot(x=y_train)
	# Loop body re-indented; the flattened original could not be parsed.
	for p in ax.patches:
	    height = p.get_height()
	    # Centre the percentage label horizontally on the bar, 10 count units
	    # above its top.
	    ax.text(p.get_x() + p.get_width() / 2.,
	            height + 10,
	            '{0:1.1%}'.format(height / total_),
	            ha='center')
	plt.title('Training label distribution', fontsize=16)
	def prediction_evaluation (algorithm, X_train, X_test, y_train, y_test,
	predictor_cols, cf = 'features'):
	"""
	Function to predict and evaluate the provided algorithm by using Plotly library
	to visualize the confusion matrix, ROC curve as well as provided the feature importances.
	@Args:
	algorithm: the model algorithm object
	X_train: the predictor features of the training pandas data frame
	X_test: the predictor features of the testing pandas data frame
	y_train: the target variable of the training pandas data frame
	# Shallow decision tree (depth 3) with a fixed seed, evaluated through the
	# project's prediction_evaluation helper in 'features' mode.
	tree_clf = tree.DecisionTreeClassifier(random_state=SEED, max_depth=3)
	_ = myUtilityFunction.prediction_evaluation(tree_clf, X_train, X_test,
	                                            y_train, y_test, X_train.columns,
	                                            'features')

	# Export the tree as Graphviz DOT source (out_file=None returns a string).
	# feature_names is given as a plain list: some scikit-learn releases validate
	# this parameter as a list and reject a pandas Index.
	dot_data = tree.export_graphviz(tree_clf, out_file=None,
	                                feature_names=list(X_train.columns),
	                                class_names=['No', 'Yes'],
	                                filled=True, rounded=True,
	                                special_characters=True)
	# Baseline XGBoost classifier with fixed hyper-parameters, evaluated through
	# the project's prediction_evaluation helper in "features" mode.
	xgb_clf = xgboost.XGBClassifier(
	    n_estimators=100,
	    max_depth=3,
	    learning_rate=0.1,
	    n_jobs=-1,
	    random_state=SEED,
	)
	_ = myUtilityFunction.prediction_evaluation(
	    xgb_clf, X_train, X_test, y_train, y_test, X_train.columns, "features"
	)
	from sklearn.model_selection import RandomizedSearchCV

	xgb_clf = xgboost.XGBClassifier(random_state=SEED, n_jobs=-1)
	params = {'n_estimators': [50, 100, 200, 300],
	'learning_rate': [0.01, 0.05, 0.1, 0.15],
	'min_child_weight': [1, 2, 3, 5, 10],
	'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 1],
	'subsample': [0.6, 0.7, 0.8],
	'colsample_bytree': [0.6, 0.7, 0.8],
	'max_depth': [3, 4, 5],
	# Grid searched with 5-fold cross-validation, scored by F1.
	cv_params = {'C': [0.001, 0.01, 0.1, 1., 10., 100.],
	             'penalty': ['l1', 'l2'],
	             'class_weight': [None, 'balanced']
	            }
	# The solver is pinned to liblinear because the grid includes the L1 penalty,
	# which the default lbfgs solver of recent scikit-learn releases does not
	# support (those candidates would otherwise fail to fit).
	fix_params = {'random_state': SEED, 'solver': 'liblinear'}
	log_cv_1 = GridSearchCV(LogisticRegression(**fix_params), cv_params, scoring='f1', cv=5)
	log_cv_1.fit(X_train, y_train)

	# Build a fresh model from the fixed settings plus the best grid point.
	# The merged dict must be unpacked as keyword arguments; the previous
	# `LogisticRegression({fix_params, **...})` was a syntax error.
	log_clf_all = LogisticRegression(**{**fix_params, **log_cv_1.best_params_})
	_ = myUtilityFunction.prediction_evaluation(log_clf_all, X_train, X_test, y_train, y_test,
	                                            X_train.columns, "coefficients")