Skip to content

Instantly share code, notes, and snippets.

View netsatsawat's full-sized avatar

Satsawat Natakarnkitkul (Net) netsatsawat

View GitHub Profile
@netsatsawat
netsatsawat / quick_look.py
Last active May 18, 2019 16:23
Read in data and sample check
import pandas as pd
# Load the HR attrition data set and run a quick sanity check on it.
data_df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
print(data_df.shape)  # (rows, columns)
# NOTE(review): `display` is not defined in this snippet; presumably the
# IPython/Jupyter helper -- confirm the snippet is meant to run in a notebook.
display(data_df.describe())
print(data_df.isnull().sum())  # number of missing values per column

# Collect every column that holds a single distinct value, mapped to the list
# containing that value.
one_uniq = {}
for col in data_df.columns:
    # Compute the distinct values once and reuse them for both the test and
    # the stored result.
    uniq_vals = data_df[col].unique()
    if len(uniq_vals) == 1:
        one_uniq[col] = uniq_vals.tolist()
@netsatsawat
netsatsawat / plot_categorical.py
Created May 18, 2019 17:39
Function to plot categorical data
def plot_categorical(df: pd.DataFrame , col:str):
    """
    Plot a categorical column of a data frame as a pie chart using Plotly.

    NOTE(review): only the signature and this docstring are visible in the
    excerpt; the plotting code is not shown, so the description above is
    carried over from the original docstring -- confirm against the full file.

    @Args:
      df: pandas data frame
      col: name of the column within the data frame to plot
    Return:
      No object is returned, only the visualization
    """
@netsatsawat
netsatsawat / Work_life_viz.py
Created May 19, 2019 09:36
Visualization for question 3 in HR analytics blog
# Work-life balance versus attrition, drawn on a 2x2 grid of axes.
_tmp_order = ['Bad', 'Good', 'Better', 'Best']  # category order on the x-axis
f, axes = plt.subplots(2, 2, figsize=(14, 14))

# Top-left panel: row counts per work-life-balance level, split by attrition.
sns.countplot(
    data=data_df,
    x='WorkLifeBalance',
    hue='Attrition',
    order=_tmp_order,
    palette={'Yes': 'r', 'No': 'skyblue'},
    ax=axes[0, 0],
)
axes[0, 0].set_title('Overall Work Life Balance')

# Top-right panel: distance from home per work-life-balance level, split by
# attrition.
sns.boxplot(
    data=data_df,
    x='WorkLifeBalance',
    y='DistanceFromHome',
    hue='Attrition',
    order=_tmp_order,
    palette={'Yes': 'r', 'No': 'skyblue'},
    ax=axes[0, 1],
)
axes[0, 1].set_title('Compare with distance from home')
@netsatsawat
netsatsawat / hr_feature_engineering.py
Created May 19, 2019 11:51
Feature engineering function
def create_generation_feature(age_val: int) -> str:
    """
    Convert an age value into a generation string.

    NOTE(review): the function body is cut off after the first statement in
    this excerpt, so the age-to-generation mapping is not visible here.

    @Args:
      age_val (int): the age value from the data frame
    Return:
      String output specifying the generation
    """
    out = ''  # starts empty; presumably assigned by logic beyond this excerpt
@netsatsawat
netsatsawat / check_target_label.py
Created May 19, 2019 15:40
Check the target variable counts in train data set
# Bar chart of the training-label counts, each bar annotated with its share
# of the total.
plt.figure(figsize=(10, 6))
total_ = float(len(y_train))
# Pass the labels by keyword: seaborn deprecated positional data vectors, and
# newer releases accept only `data` positionally.
ax = sns.countplot(x=y_train)
for p in ax.patches:
    height = p.get_height()
    ax.text(
        p.get_x() + p.get_width() / 2.,  # horizontal centre of the bar
        height + 10,  # slightly above the top of the bar
        '{0:1.1%}'.format(height / total_),
        ha='center',
    )
plt.title('Training label distribution', fontsize=16)
@netsatsawat
netsatsawat / prediction_evaluation.py
Created May 19, 2019 17:29
Function to plot the model evaluation with test data
def prediction_evaluation (algorithm, X_train, X_test, y_train, y_test,
predictor_cols, cf = 'features'):
"""
Function to predict and evaluate the provided algorithm, using the Plotly library
to visualize the confusion matrix and ROC curve as well as provide the feature importances.
@Args:
algorithm: the model algorithm object
X_train: the predictor features of the training pandas data frame
X_test: the predictor features of the testing pandas data frame
y_train: the target variable of the training pandas data frame
@netsatsawat
netsatsawat / decision_tree.py
Created May 19, 2019 17:31
Code snippet for decision tree and plot the decision tree to file
# Shallow decision tree, evaluated through the shared utility helper.
# NOTE(review): the tree must be fitted before it can be exported below;
# presumably `prediction_evaluation` fits it in place -- confirm.
tree_clf = tree.DecisionTreeClassifier(random_state=SEED, max_depth=3)
_ = myUtilityFunction.prediction_evaluation(
    tree_clf, X_train, X_test, y_train, y_test, X_train.columns, 'features'
)

# Export the tree in Graphviz DOT format; with out_file=None the DOT source is
# returned as a string instead of being written to a file.
dot_data = tree.export_graphviz(
    tree_clf,
    out_file=None,
    # A plain list of names is accepted by every scikit-learn release,
    # whereas a pandas Index is rejected by some versions.
    feature_names=list(X_train.columns),
    class_names=['No', 'Yes'],
    filled=True,
    rounded=True,
    special_characters=True,
)
@netsatsawat
netsatsawat / xgboost_classifier.py
Created May 19, 2019 18:20
Code snippet for xgboost and evaluating the performance
# XGBoost classifier with hand-picked hyperparameters, evaluated through the
# shared utility helper.
xgb_clf = xgboost.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=SEED,
)
_ = myUtilityFunction.prediction_evaluation(
    xgb_clf, X_train, X_test, y_train, y_test, X_train.columns, "features"
)
@netsatsawat
netsatsawat / hyperparameter_tuning_xgboost.py
Last active May 19, 2019 18:38
Code snippet to optimize the hyperparameters of XGBoost algorithm
from sklearn.model_selection import RandomizedSearchCV
xgb_clf = xgboost.XGBClassifier(random_state=SEED, n_jobs=-1)
params = {'n_estimators': [50, 100, 200, 300],
'learning_rate': [0.01, 0.05, 0.1, 0.15],
'min_child_weight': [1, 2, 3, 5, 10],
'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 1],
'subsample': [0.6, 0.7, 0.8],
'colsample_bytree': [0.6, 0.7, 0.8],
'max_depth': [3, 4, 5],
@netsatsawat
netsatsawat / logistic_regression.py
Created May 19, 2019 20:13
Tune hyperparameters using grid search CV on logit
# Hyperparameter grid explored by the cross-validated search.
cv_params = {
    'C': [0.001, 0.01, 0.1, 1., 10., 100.],
    'penalty': ['l1', 'l2'],
    'class_weight': [None, 'balanced'],
}
# Parameters held constant for every candidate and for the final model.
# The solver is pinned to liblinear because the grid includes the L1 penalty:
# the current scikit-learn default solver (lbfgs) supports only L2/none, so
# every 'l1' candidate would otherwise fail to fit. liblinear was the default
# solver before scikit-learn 0.22, so older environments behave as before.
fix_params = {'random_state': SEED, 'solver': 'liblinear'}

# 5-fold grid search, selecting the candidate with the best F1 score.
log_cv_1 = GridSearchCV(LogisticRegression(**fix_params), cv_params, scoring='f1', cv=5)
log_cv_1.fit(X_train, y_train)

# Build a fresh model from the fixed parameters plus the best grid values and
# pass it to the shared evaluation helper.
log_clf_all = LogisticRegression(**{**fix_params, **log_cv_1.best_params_})
_ = myUtilityFunction.prediction_evaluation(
    log_clf_all, X_train, X_test, y_train, y_test, X_train.columns, "coefficients"
)