Skip to content

Instantly share code, notes, and snippets.

View vannguyen3007's full-sized avatar
๐Ÿˆ
Focusing

vannguyen3007

๐Ÿˆ
Focusing
View GitHub Profile
@vannguyen3007
vannguyen3007 / model_crawler.py
Created October 5, 2020 10:16
Model_crawler
import datetime
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy import Column, DateTime, BigInteger, String, Text
# Shared SQLAlchemy handle for the crawler's database models.
db_crawler = SQLAlchemy()


class Company(db_crawler.Model):
    """ORM model for a crawled company record (table ``company_lists``).

    NOTE(review): the gist preview is truncated here — the original model
    almost certainly declares more columns than the primary key shown.
    """

    __tablename__ = 'company_lists'

    # Surrogate primary key for the crawled company row.
    id = Column(BigInteger, primary_key=True)
@vannguyen3007
vannguyen3007 / feature_importance.py
Created October 5, 2020 08:24
feature_importance
# LightGBM hyper-parameters (pre-tuned values).
# The pasted snippet was truncated before the closing brace; restored here.
lgbm_params = {
    'nthread': 4,               # CPU threads used by LightGBM
    'n_estimators': 10000,      # upper bound — pair with early stopping
    'learning_rate': .02,
    'num_leaves': 34,
    'colsample_bytree': .9497036,
    'subsample': .8715623,
    'max_depth': 8,
    'reg_alpha': .041545473,    # L1 regularization
    'reg_lambda': .0735294,     # L2 regularization
}
@vannguyen3007
vannguyen3007 / def_folds.py
Created October 5, 2020 08:21
display_folds_importances
def display_folds_importances(feature_importance_df_, n_folds=5):
    """Plot the top-20 feature importances for each fold, plus the fold mean.

    Parameters
    ----------
    feature_importance_df_ : DataFrame indexed by feature name with one
        column per fold (integer column labels 0..n_folds-1) and a 'mean'
        column.  # assumes this layout from the column labels used — confirm
    n_folds : number of per-fold columns to plot (default 5).

    Uses the module-level ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn).
    The pasted snippet had lost its indentation; restored here.
    """
    n_columns = 3
    # +1 extra cell so the 'mean' plot fits after the per-fold plots.
    n_rows = (n_folds + 1) // n_columns
    _, axes = plt.subplots(n_rows, n_columns, figsize=(8 * n_columns, 8 * n_rows))
    for i in range(n_folds):
        sns.barplot(x=i, y='index',
                    data=feature_importance_df_.reset_index().sort_values(i, ascending=False).head(20),
                    ax=axes[i // n_columns, i % n_columns])
    # Mean importance goes in the last grid cell.
    # NOTE(review): for n_folds values that exactly fill the grid this cell
    # can coincide with a fold plot — fine for the default n_folds=5.
    sns.barplot(x='mean', y='index',
                data=feature_importance_df_.reset_index().sort_values('mean', ascending=False).head(20),
                ax=axes[n_rows - 1, n_columns - 1])
    plt.title('LightGBM Features (avg over folds)')
@vannguyen3007
vannguyen3007 / function_optimize.py
Created October 5, 2020 08:19
Bayesian optimization
# NOTE(review): the gist preview is truncated — only the head of cv_scores is
# visible; the cross-validation loop, scoring, and return are cut off, so the
# code below is left byte-identical rather than reconstructed.
def cv_scores(df, num_folds, params, stratified = False, verbose = -1,
save_train_prediction = False, train_prediction_file_name = 'train_prediction.csv',
save_test_prediction = True, test_prediction_file_name = 'test_prediction.csv'):
# Silence library warnings for the whole CV run.
warnings.simplefilter('ignore')
# LGBMClassifier configured from the caller-supplied parameter dict.
clf = LGBMClassifier(**params)
# Divide in training/validation and test data
# Rows with a TARGET label form the training set; unlabeled rows are test.
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]
@vannguyen3007
vannguyen3007 / missing_value.py
Created October 4, 2020 09:41
Remove missing value
# Function to calculate missing values by column — credits Will Koehrsen
def missing_values_table(df):
    """Summarize missing values per column of *df*.

    Returns a DataFrame indexed by column name with two columns:
    0 — count of missing values, 1 — percentage of rows missing.

    The pasted snippet had lost its indentation and was truncated before the
    return; a minimal ``return`` of the computed table is restored here.
    """
    # Total missing values
    mis_val = df.isnull().sum()
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    return mis_val_table
@vannguyen3007
vannguyen3007 / Read_data.py
Created October 4, 2020 09:31
Load and read data
# Load the application train/test data from the local Data/ directory.
import os

file_path = 'Data/'
# Sanity check: show which data files are present.
print(os.listdir(file_path))

# ``pd`` (pandas) is imported elsewhere in the original file.
# os.path.join is equivalent here to the original string concatenation
# (file_path already ends in '/'), but robust if the prefix changes.
app_train = pd.read_csv(os.path.join(file_path, 'application_train.csv'))
app_test = pd.read_csv(os.path.join(file_path, 'application_test.csv'))
@vannguyen3007
vannguyen3007 / package.py
Created October 4, 2020 09:23
important_package
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import time
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
@vannguyen3007
vannguyen3007 / maximum_voting.py
Created October 2, 2020 11:43
Maximum_voting_models
# Soft-voting ensemble over the calibrated LR / SVM / RF classifiers.
vclf = VotingClassifier(
    estimators=[('lr', calib_clf_LR), ('svc', calib_clf_SVM), ('rf', calib_clf_RF)],
    voting='soft',
    n_jobs=-1,
)
vclf.fit(Train_X, Train_Y)

# Log loss on each split, rounded to 2 decimals.
for split_name, split_X, split_Y in (('train', Train_X, Train_Y),
                                     ('CV', CV_X, CV_Y),
                                     ('test', Test_X, Test_Y)):
    split_loss = np.round(log_loss(split_Y, vclf.predict_proba(split_X), labels=vclf.classes_), 2)
    print(f"Log loss ({split_name}) on the VotingClassifier :{split_loss}")

# Mis-classification rate (percent) on CV and test points.
cv_err = np.round(np.count_nonzero(vclf.predict(CV_X) - CV_Y) / CV_X.shape[0] * 100, 2)
print(f"Percentage of mis-classified for cv points :{cv_err}%")
test_err = np.round(np.count_nonzero(vclf.predict(Test_X) - Test_Y) / Test_X.shape[0] * 100, 2)
print(f"Percentage of mis-classified for Test points :{test_err}%")
print_confusionMatrix(Test_Y, vclf.predict(Test_X))
@vannguyen3007
vannguyen3007 / predictions.py
Created October 2, 2020 11:40
Making predictions
# Refit the stacking ensemble using the best alpha found during CV.
best_alpha = alpha[np.argmin(cv_log_loss)]
lr = SGDClassifier(loss="log", alpha=best_alpha)
stack_clf = StackingClassifier(
    classifiers=[calib_clf_NB, calib_clf_LR, calib_clf_SVM, calib_clf_RF],
    meta_classifier=lr,
    use_probas=True,
)
stack_clf.fit(Train_X, Train_Y)

# Report the (rounded) training log loss of the refit stack.
trainLogLoss = log_loss(Train_Y, stack_clf.predict_proba(Train_X))
print(f"Train Log Loss on Stacking Classifier = {np.round(trainLogLoss, 4)}")
@vannguyen3007
vannguyen3007 / Stacking_Classifier.py
Created October 2, 2020 11:32
Stacking Classifier
# Tune the SGD meta-classifier's regularization strength by CV log loss.
cv_log_loss = []
alpha = [10**x for x in range(-3, 0)]  # 0.001, 0.01, 0.1
for i in alpha:
    lr = SGDClassifier(loss="log", alpha=i)
    stack_clf = StackingClassifier(classifiers=[calib_clf_NB, calib_clf_LR, calib_clf_SVM, calib_clf_RF],
                                   meta_classifier=lr, use_probas=True)
    stack_clf.fit(Train_X, Train_Y)
    # Compute the CV probabilities once and reuse them — the original called
    # predict_proba(CV_X) twice per iteration (once for the list, once for
    # the print), doubling the scoring cost for identical output.
    cv_loss = log_loss(CV_Y, stack_clf.predict_proba(CV_X))
    cv_log_loss.append(cv_loss)
    # NOTE(review): "Classifer" typo kept to preserve the original output.
    print("Stacking Classifer : For alpha value: "+str(i)+" Log Loss: "+str(np.round(cv_loss, 4)))