Skip to content

Instantly share code, notes, and snippets.

View dhruv1394's full-sized avatar
🏠
Working from home

Dhruv Narayanan dhruv1394

🏠
Working from home
View GitHub Profile
body{
margin: 0;
padding: 0;
text-align: center;
background: url(Home_Credit/hcdr.png);
<!DOCTYPE html>
<html>
<head>
<title>Predict Loan Default Risk</title>
</head>
import pandas as pd
import re
import time
import numpy as np
import gc
import lightgbm as lgb
import math
import pickle
import os
import os.path
# Persist the training-set predictions to disk so they can be reused without
# recomputing them.
# NOTE(review): open() does not create the 'lgbm/' directory; it must already
# exist or this raises FileNotFoundError.
with open('lgbm/lgbm_train_predict_500f.pkl', 'wb') as f:
    pickle.dump(train_predict, f)

# Read the same pickle file back into memory.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickle files that this pipeline wrote itself.
with open('lgbm/lgbm_train_predict_500f.pkl', 'rb') as f:
    train_predict = pickle.load(f)
# Build the submission file: the SK_ID_CURR identifier from test_data next to
# the predicted TARGET value, one row per test record.
features_top_df_test['SK_ID_CURR'] = test_data['SK_ID_CURR']
features_top_df_test['TARGET'] = test_predict

# Cast the ID column to 32-bit integers in one vectorised step (instead of a
# per-element lambda) so the IDs are written to the CSV as integers.
features_top_df_test['SK_ID_CURR'] = features_top_df_test['SK_ID_CURR'].astype(np.int32)

# index=False keeps the DataFrame index out of the file; only the two
# selected columns are written.
features_top_df_test[['SK_ID_CURR', 'TARGET']].to_csv('hcdr_lgbm_500f_final.csv', index=False)
# Reload the previously pickled train, CV and test predictions.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickle files that this pipeline wrote itself.
with open('lgbm/lgbm_train_predict_500f.pkl', 'rb') as f:
    train_predict = pickle.load(f)

with open('lgbm/lgbm_cv_predict_500f.pkl', 'rb') as f:
    cv_predict = pickle.load(f)

with open('lgbm/lgbm_test_predict_500f.pkl', 'rb') as f:
    test_predict = pickle.load(f)
with open('lgbm/lgbm_best_threshold_500f_api.pkl','rb') as f:
# Plot ROC curves for the training predictions and the CV predictions, both
# scored against the same label vector `y`.
# NOTE(review): datetime, roc_curve, auc and plt are not imported in the
# visible part of this file — confirm they are imported elsewhere.
# `start` is assigned here but not read again within this snippet.
start = datetime.now()
# Each roc_curve call yields false-positive rates, true-positive rates and the
# thresholds that produced them; the thresholds are not used below.
train_fpr5, train_tpr5, tr_thresholds5 = roc_curve(y, train_predict)
cv_fpr5,cv_tpr5, cv_thresholds5 = roc_curve(y, cv_predict)
# The AUC of each curve is embedded in its legend label.
plt.plot(train_fpr5,train_tpr5, label ="Training Data AUC :" + str(auc(train_fpr5,train_tpr5)))
plt.plot(cv_fpr5,cv_tpr5,label="CV Data AUC :" + str(auc(cv_fpr5,cv_tpr5)))
plt.legend()
plt.xlabel("FPR Values")
start = datetime.now()
a=0
for i,(train, cv) in enumerate(f.split(train_df[feats],y)):
X_train, Y_train = train_df[feats].iloc[train], y.iloc[train]
X_valid, Y_valid = train_df[feats].iloc[cv], y.iloc[cv]
lgb = LGBMClassifier(
n_estimators=10837, \
bagging_fraction= 0.7327318230470493, \
# ROC curve for the rows selected by the `train` indexer, comparing their
# labels with the matching entries of train_predict.
fpr_t, tpr_t, thresh = roc_curve(Y_train, train_predict[train])
# tpr - fpr at every candidate threshold (Youden's J statistic); the threshold
# with the largest value is taken as the best one.
best_stat = tpr_t - fpr_t
best_thresh_index = np.argmax(best_stat)
# Adds one fifth of the chosen threshold to a running total — presumably this
# runs once per fold of a 5-fold split so the total ends up as the mean;
# TODO confirm, and confirm best_threshold_train is initialised beforehand.
best_threshold_train += thresh[best_thresh_index]/5
# 5-fold cross-validation splitter; shuffle=True with a fixed random_state
# makes the shuffled fold assignment reproducible.
f = KFold(n_splits=5, shuffle=True, random_state=0)
lgbm_df = pd.DataFrame()


def _sanitize_column(name):
    """Return *name* with every run of non [A-Za-z0-9_] characters removed."""
    return re.sub('[^A-Za-z0-9_]+', '', name)


# Apply the same column-name clean-up to both frames so their column names
# stay in sync.
# NOTE(review): presumably needed because the model rejects special characters
# in feature names — confirm.
train_df = train_df.rename(columns=_sanitize_column)
test_df = test_df.rename(columns=_sanitize_column)
y = train_df['TARGET']

# Label, identifier and index columns are excluded from the feature list.
# The loop variable is named `col` so it no longer shadows the KFold
# splitter `f` defined above.
_NON_FEATURE_COLUMNS = ('TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index')
feats = [col for col in train_df.columns if col not in _NON_FEATURE_COLUMNS]
lgbm_df['feat'] = feats