Dhruv Narayanan dhruv1394

## index.css
body{

	margin: 0;

	padding: 0;

	text-align: center;

	background: url(Home_Credit/hcdr.png);

## index.html
<!DOCTYPE html>

<html>

<head>

	<title>Predict Loan Default Risk</title>

</head>

## app.py
import pandas as pd
import re
import time
import numpy as np
import gc
import lightgbm as lgb
import math
import pickle
import os
import os.path

## pickle_usage.py
# Save the train_predict variable to disk as a pickle file.
# The handle is named pkl_file rather than f so this snippet does not rebind
# the script-level name f, which other snippets here use for the KFold splitter.
with open('lgbm/lgbm_train_predict_500f.pkl', 'wb') as pkl_file:
    pickle.dump(train_predict, pkl_file)

# Load the pickle file back into memory.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# use it on files this project wrote itself.
with open('lgbm/lgbm_train_predict_500f.pkl', 'rb') as pkl_file:
    train_predict = pickle.load(pkl_file)

## kaggle_format_lightgbm.py
# Write the submission CSV: the SK_ID_CURR ids next to the predicted TARGET.
features_top_df_test['SK_ID_CURR'] = test_data['SK_ID_CURR']
features_top_df_test['TARGET'] = test_predict
# Cast the ids to 32-bit integers in one vectorized step instead of calling a
# lambda once per row (presumably so the ids are written as plain integers —
# confirm against the expected submission format).
features_top_df_test['SK_ID_CURR'] = features_top_df_test['SK_ID_CURR'].astype(np.int32)
features_top_df_test[['SK_ID_CURR', 'TARGET']].to_csv('hcdr_lgbm_500f_final.csv', index=False)

## lightgbm_pickle.py
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): pickle can execute arbitrary code while loading; only use
    this on files the project produced itself.
    """
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


# Reload the saved train, CV and test predictions. Going through a helper
# keeps the file handle local instead of rebinding the script-level name f,
# which other snippets here use for the KFold splitter.
train_predict = _load_pickle('lgbm/lgbm_train_predict_500f.pkl')

cv_predict = _load_pickle('lgbm/lgbm_cv_predict_500f.pkl')

test_predict = _load_pickle('lgbm/lgbm_test_predict_500f.pkl')

with open('lgbm/lgbm_best_threshold_500f_api.pkl','rb') as f:

## lightgbm_roc_curve.py
start = datetime.now()

# ROC curve points for the training and CV predictions, both scored against y.
train_fpr5, train_tpr5, tr_thresholds5 = roc_curve(y, train_predict)
cv_fpr5, cv_tpr5, cv_thresholds5 = roc_curve(y, cv_predict)

# Area under each curve; the values are shown in the legend labels below.
train_auc5 = auc(train_fpr5, train_tpr5)
cv_auc5 = auc(cv_fpr5, cv_tpr5)

plt.plot(train_fpr5, train_tpr5, label="Training Data AUC :" + str(train_auc5))
plt.plot(cv_fpr5, cv_tpr5, label="CV Data AUC :" + str(cv_auc5))
plt.legend()

plt.xlabel("FPR Values")

## using_lightgbm.py
start = datetime.now()

a=0
for i,(train, cv) in enumerate(f.split(train_df[feats],y)):
    X_train, Y_train = train_df[feats].iloc[train], y.iloc[train]
    X_valid, Y_valid = train_df[feats].iloc[cv], y.iloc[cv]

    lgb = LGBMClassifier(
            n_estimators=10837, \
            bagging_fraction= 0.7327318230470493, \

## J_statistic_computation.py
# Youden's J statistic (TPR - FPR) at every ROC threshold of this fold's
# training predictions; the threshold with the largest J is selected.
fpr_t, tpr_t, thresh = roc_curve(Y_train, train_predict[train])
best_stat = tpr_t - fpr_t
best_thresh_index = best_stat.argmax()
# Accumulate one fifth of the selected threshold.
# NOTE(review): the divisor 5 presumably matches the number of CV folds so
# the running total ends up as the mean threshold — confirm.
best_threshold_train += thresh[best_thresh_index] / 5

## 5_fold_cross_validation.py
# K fold cross validation: 5 shuffled splits with a fixed random_state.
f = KFold(n_splits=5, shuffle=True, random_state=0)
lgbm_df = pd.DataFrame()


def _clean_column_name(name):
    """Return ``name`` with every character outside A-Z, a-z, 0-9 and _ removed."""
    return re.sub('[^A-Za-z0-9_]+', '', name)


# Apply the same column-name clean-up to the train and test frames.
train_df = train_df.rename(columns=_clean_column_name)
test_df = test_df.rename(columns=_clean_column_name)
y = train_df['TARGET']

# Every column except the target and the id/index columns is a model feature.
excluded_columns = ('TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index')
feats = [column for column in train_df.columns if column not in excluded_columns]
lgbm_df['feat'] = feats
	body{

	margin: 0;

	padding: 0;

	text-align: center;

	background: url(Home_Credit/hcdr.png);
	<!DOCTYPE html>

	<html>

	<head>

	<title>Predict Loan Default Risk</title>

	</head>
	import pandas as pd
	import re
	import time
	import numpy as np
	import gc
	import lightgbm as lgb
	import math
	import pickle
	import os
	import os.path
	# Save the train_predict variable to disk as a pickle file.
	# The body of each with-statement is indented again (it had been flattened),
	# and the handle is named pkl_file rather than f so it does not rebind the
	# script-level name f, which other snippets here use for the KFold splitter.
	with open('lgbm/lgbm_train_predict_500f.pkl', 'wb') as pkl_file:
		pickle.dump(train_predict, pkl_file)

	# Load the pickle file back into memory.
	# NOTE(review): pickle.load can execute arbitrary code while loading; only
	# use it on files this project wrote itself.
	with open('lgbm/lgbm_train_predict_500f.pkl', 'rb') as pkl_file:
		train_predict = pickle.load(pkl_file)
	# Write the submission CSV: the SK_ID_CURR ids next to the predicted TARGET.
	features_top_df_test['SK_ID_CURR'] = test_data['SK_ID_CURR']
	features_top_df_test['TARGET'] = test_predict
	# Cast the ids to 32-bit integers in one vectorized step instead of calling a
	# lambda once per row (presumably so the ids are written as plain integers —
	# confirm against the expected submission format).
	features_top_df_test['SK_ID_CURR'] = features_top_df_test['SK_ID_CURR'].astype(np.int32)
	features_top_df_test[['SK_ID_CURR', 'TARGET']].to_csv('hcdr_lgbm_500f_final.csv', index=False)
	def _load_pickle(path):
		"""Return the object stored in the pickle file at ``path``.

		NOTE(review): pickle can execute arbitrary code while loading; only use
		this on files the project produced itself.
		"""
		with open(path, 'rb') as pkl_file:
			return pickle.load(pkl_file)


	# Reload the saved train, CV and test predictions. The helper restores the
	# with-statement structure that had been flattened and keeps the file handle
	# local instead of rebinding the script-level name f, which other snippets
	# here use for the KFold splitter.
	train_predict = _load_pickle('lgbm/lgbm_train_predict_500f.pkl')

	cv_predict = _load_pickle('lgbm/lgbm_cv_predict_500f.pkl')

	test_predict = _load_pickle('lgbm/lgbm_test_predict_500f.pkl')

	with open('lgbm/lgbm_best_threshold_500f_api.pkl','rb') as f:
	start = datetime.now()

	# ROC curve points for the training and CV predictions, both scored against y.
	train_fpr5, train_tpr5, tr_thresholds5 = roc_curve(y, train_predict)
	cv_fpr5, cv_tpr5, cv_thresholds5 = roc_curve(y, cv_predict)

	# Area under each curve; the values are shown in the legend labels below.
	train_auc5 = auc(train_fpr5, train_tpr5)
	cv_auc5 = auc(cv_fpr5, cv_tpr5)

	plt.plot(train_fpr5, train_tpr5, label="Training Data AUC :" + str(train_auc5))
	plt.plot(cv_fpr5, cv_tpr5, label="CV Data AUC :" + str(cv_auc5))
	plt.legend()

	plt.xlabel("FPR Values")
	start = datetime.now()

	a=0
	for i,(train, cv) in enumerate(f.split(train_df[feats],y)):
	X_train, Y_train = train_df[feats].iloc[train], y.iloc[train]
	X_valid, Y_valid = train_df[feats].iloc[cv], y.iloc[cv]

	lgb = LGBMClassifier(
	n_estimators=10837, \
	bagging_fraction= 0.7327318230470493, \
	# Youden's J statistic (TPR - FPR) at every ROC threshold of this fold's
	# training predictions; the threshold with the largest J is selected.
	fpr_t, tpr_t, thresh = roc_curve(Y_train, train_predict[train])
	best_stat = tpr_t - fpr_t
	best_thresh_index = best_stat.argmax()
	# Accumulate one fifth of the selected threshold.
	# NOTE(review): the divisor 5 presumably matches the number of CV folds so
	# the running total ends up as the mean threshold — confirm.
	best_threshold_train += thresh[best_thresh_index] / 5
	# K fold cross validation: 5 shuffled splits with a fixed random_state.
	f = KFold(n_splits=5, shuffle=True, random_state=0)
	lgbm_df = pd.DataFrame()


	def _clean_column_name(name):
		"""Return ``name`` with every character outside A-Z, a-z, 0-9 and _ removed."""
		return re.sub('[^A-Za-z0-9_]+', '', name)


	# Apply the same column-name clean-up to the train and test frames.
	train_df = train_df.rename(columns=_clean_column_name)
	test_df = test_df.rename(columns=_clean_column_name)
	y = train_df['TARGET']

	# Every column except the target and the id/index columns is a model feature.
	excluded_columns = ('TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index')
	feats = [column for column in train_df.columns if column not in excluded_columns]
	lgbm_df['feat'] = feats