# Source: GitHub gist abdel1979/60eeb01228709c4557f0e287b6fa21b0 (created May 13, 2019 14:21).
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that may be
# interpreted or compiled differently than it appears. To review, open the file in an
# editor that reveals hidden Unicode characters.
import warnings  # To ignore any warnings

import matplotlib.pyplot as plt  # For plotting graphs
import numpy as np  # For mathematical calculations
import pandas as pd
import seaborn as sns  # For data visualization
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
# Load the training and test sets from the working directory.
train = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

# Untouched snapshots of the raw data, taken before any cleaning below.
# Copying the loaded frames avoids parsing each CSV a second time.
train_original = train.copy()
test_original = test.copy()

# Structural overview: column names, dtypes and (rows, columns) of both sets.
print(train.columns)
print(train.dtypes)
print(train.shape, test.shape)
# analyse data
# --- Univariate analysis ---
# Each chart group is drawn on its own figure with explicit axes, so plots do
# not pile up on whichever figure happens to be current while the plt.show()
# calls are disabled.


def _plot_proportions(columns_and_titles, nrows, ncols, figsize):
    """Draw one normalised value-count bar chart per column on a new figure.

    columns_and_titles: iterable of (train column name, subplot title) pairs.
    nrows, ncols: subplot grid shape; it must provide one axes per pair.
    figsize: figure size in inches, passed to plt.subplots.
    """
    _, axes = plt.subplots(nrows, ncols, figsize=figsize)
    for axis, (column, title) in zip(axes.ravel(), columns_and_titles):
        train[column].value_counts(normalize=True).plot.bar(ax=axis, title=title)


def _plot_distribution_and_box(values):
    """Show the distribution (left) and a box plot (right) of *values*."""
    _, (dist_axis, box_axis) = plt.subplots(1, 2, figsize=(16, 5))
    # NOTE(review): distplot is deprecated in recent seaborn releases; switch
    # to histplot/displot once the minimum supported seaborn version allows.
    sns.distplot(values, ax=dist_axis)
    values.plot.box(ax=box_axis)


# Target variable: absolute counts, proportions, and a bar chart of the counts.
print(train['Loan_Status'].value_counts())
print(train['Loan_Status'].value_counts(normalize=True))
plt.figure()
train['Loan_Status'].value_counts().plot.bar()
#plt.show()

# Proportions of the first group of categorical columns, on a 2x2 grid.
_plot_proportions(
    [
        ('Gender', 'Gender'),
        ('Married', 'Married'),
        ('Self_Employed', 'S-E'),
        ('Credit_History', 'C-H'),
    ],
    nrows=2, ncols=2, figsize=(20, 10),
)
#plt.show()

# Proportions of the second group of categorical columns, side by side.
_plot_proportions(
    [
        ('Dependents', 'Dependents'),
        ('Education', 'Education'),
        ('Property_Area', 'Property_Area'),
    ],
    nrows=1, ncols=3, figsize=(24, 6),
)
#plt.show()

_plot_distribution_and_box(train['ApplicantIncome'])
#plt.show()

# Applicant income split by education level (pandas opens its own figure).
train.boxplot(column='ApplicantIncome', by='Education')
plt.suptitle("boxplot")
#plt.show()

_plot_distribution_and_box(train['CoapplicantIncome'])
#plt.show()

# Only rows whose LoanAmount itself is missing are left out; dropping rows
# with a gap in any column would discard valid loan amounts.
_plot_distribution_and_box(train['LoanAmount'].dropna())
#plt.show()
# --- Categorical features against the target ---
# Count tables of each categorical column against Loan_Status.
Gender = pd.crosstab(train['Gender'], train['Loan_Status'])
Married = pd.crosstab(train['Married'], train['Loan_Status'])
Dependents = pd.crosstab(train['Dependents'], train['Loan_Status'])
Education = pd.crosstab(train['Education'], train['Loan_Status'])
Self_Employed = pd.crosstab(train['Self_Employed'], train['Loan_Status'])
Credit_History = pd.crosstab(train['Credit_History'], train['Loan_Status'])
Property_Area = pd.crosstab(train['Property_Area'], train['Loan_Status'])

# One stacked bar chart per table. Dividing each row by its total turns the
# counts into within-category shares. figsize=None keeps the pandas default.
for counts, size in (
    (Gender, (4, 4)),
    (Married, (4, 4)),
    (Dependents, None),
    (Education, (4, 4)),
    (Self_Employed, (4, 4)),
    (Credit_History, (4, 4)),
    (Property_Area, None),
):
    shares = counts.div(counts.sum(axis=1).astype(float), axis=0)
    shares.plot(kind="bar", stacked=True, figsize=size)
    #plt.show()

# Mean applicant income per loan status. A Series plot reuses the current
# axes, so open a new figure instead of drawing over the last stacked chart.
plt.figure()
train.groupby('Loan_Status')['ApplicantIncome'].mean().plot.bar()
#plt.show()
# --- Binned numeric features against the target ---


def _plot_binned_approval(values, bins, labels, name, figsize=None, xlabel=None):
    """Plot Loan_Status shares per bin of a numeric series as stacked bars.

    values: numeric Series aligned with ``train``.
    bins: bin edges passed to pd.cut.
    labels: one label per bin.
    name: name given to the binned series (used as the crosstab row label).
    figsize: optional figure size; None keeps the pandas default.
    xlabel: when given, sets the x label and labels the y axis 'Percentage'.
    """
    # include_lowest=True keeps values equal to the first edge (e.g. an
    # income of 0) in the first bin; without it pd.cut turns them into NaN
    # and the crosstab silently drops those rows.
    binned = pd.cut(values, bins, labels=labels, include_lowest=True).rename(name)
    counts = pd.crosstab(binned, train['Loan_Status'])
    shares = counts.div(counts.sum(axis=1).astype(float), axis=0)
    shares.plot(kind="bar", stacked=True, figsize=figsize)
    if xlabel is not None:
        plt.xlabel(xlabel)
        plt.ylabel('Percentage')


# The bins are computed from the full training columns rather than from a
# dropna() copy, so a missing value in an unrelated column no longer removes
# the row from these charts. The binned series are kept local, so no
# temporary columns have to be added to and later dropped from train.
_plot_binned_approval(
    train['ApplicantIncome'],
    [0, 2500, 4000, 6000, 81000],
    ['Low', 'Average', 'High', 'Very high'],
    'Income_bin',
    figsize=(10, 4),
)
#plt.show()

_plot_binned_approval(
    train['CoapplicantIncome'],
    [0, 1000, 3000, 42000],
    ['Low', 'Average', 'High'],
    'Coapplicant_Income_bin',
)
#plt.show()

_plot_binned_approval(
    train['ApplicantIncome'] + train['CoapplicantIncome'],
    [0, 2500, 4000, 6000, 81000],
    ['Low', 'Average', 'High', 'Very high'],
    'Total_Income_bin',
    xlabel='Total_Income',
)
#plt.show()

_plot_binned_approval(
    train['LoanAmount'],
    [0, 100, 200, 700],
    ['Low', 'Average', 'High'],
    'LoanAmount_bin',
    xlabel='LoanAmount',
)
# --- Encoding and correlation ---
# Results are assigned back to the column: calling replace(..., inplace=True)
# on a column selection is chained assignment, which is not guaranteed to
# update the parent frame.
train['Dependents'] = train['Dependents'].replace('3+', 3)
test['Dependents'] = test['Dependents'].replace('3+', 3)

# Encode the target as 0/1. Any label other than 'N'/'Y' becomes NaN instead
# of being passed through unchanged.
train['Loan_Status'] = train['Loan_Status'].map({'N': 0, 'Y': 1})

# Correlate only the numeric columns; the string columns cannot be
# correlated and make DataFrame.corr() raise on current pandas.
matrix = train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(matrix, vmax=.8, square=True, cmap="BuPu", ax=ax)
#plt.show()
# --- Missing values ---
print(train.isnull().sum())

# Fill values are taken from the training set only and applied to both sets:
# the most frequent value for categorical-like columns, the median for
# LoanAmount.
mode_filled_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
    'Loan_Amount_Term',
]
fill_values = {column: train[column].mode()[0] for column in mode_filled_columns}
fill_values['LoanAmount'] = train['LoanAmount'].median()

# A dict passed to DataFrame.fillna fills only the listed columns, and the
# result is assigned back instead of relying on chained inplace calls.
train = train.fillna(value=fill_values)
print('-------')
print(train.isnull().sum())

# 'Married' is now imputed in the test set as well, for consistency with
# the training set.
test = test.fillna(value=fill_values)

# --- Log transform of the loan amount ---
train['LoanAmount_log'] = np.log(train['LoanAmount'])
test['LoanAmount_log'] = np.log(test['LoanAmount'])

# New figure so the histogram is not drawn on the previously active figure.
plt.figure()
train['LoanAmount_log'].hist(bins=20)
#plt.show()
# --- Feature matrix ---
# Loan_ID is an identifier, not a feature.
train = train.drop('Loan_ID', axis=1)
test = test.drop('Loan_ID', axis=1)

# axis is passed by keyword; the positional form drop(label, 1) is rejected
# by current pandas.
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status

#dummies
X = pd.get_dummies(X)
train = pd.get_dummies(train)
# Align the test dummies with the training features: same columns, same
# order, and all-zero columns for categories that never occur in the test set.
test = pd.get_dummies(test).reindex(columns=X.columns, fill_value=0)
# --- Logistic regression with a 70/30 hold-out split ---
# random_state makes the split, and therefore the reported accuracy,
# reproducible between runs.
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=1)

model = LogisticRegression()
model.fit(x_train, y_train)
# The estimator repr echoed from an interactive session used to follow here
# as code; it only built and discarded an unused estimator, so it was removed.

pred_cv = model.predict(x_cv)
# Report the hold-out accuracy instead of computing and discarding it.
print('accuracy_score', accuracy_score(y_cv, pred_cv))

# Predict the test set and write the submission file.
pred_test = model.predict(test)
submission = pd.read_csv("Sample_Submission_ZAuTl8O_FK3zQHh.csv")
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']
# Map the 0/1 predictions back to the N/Y labels; the result is assigned
# back rather than replaced in place on a column selection.
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('logistic.csv')
# --- Logistic regression with stratified 5-fold cross-validation ---
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so select rows with iloc.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = LogisticRegression(random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)

# After the loop, `model` is the estimator fitted on the last fold; it
# produces the test predictions and the validation probabilities.
pred_test = model.predict(test)
pred = model.predict_proba(xvl)[:, 1]

submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']
# Map the 0/1 predictions back to the N/Y labels.
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Logistic2.csv')
# --- Feature engineering ---


def _add_income_features(frame):
    """Return a copy of *frame* with derived income features.

    Adds Total_Income, Total_Income_log, EMI and 'Balance Income', then drops
    the raw ApplicantIncome, CoapplicantIncome, LoanAmount and
    Loan_Amount_Term columns they were computed from.
    """
    frame = frame.copy()
    frame['Total_Income'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']
    frame['Total_Income_log'] = np.log(frame['Total_Income'])
    frame['EMI'] = frame['LoanAmount'] / frame['Loan_Amount_Term']
    # NOTE(review): the factor 1000 presumably converts EMI to the same unit
    # as the income columns -- confirm against the data dictionary.
    frame['Balance Income'] = frame['Total_Income'] - (frame['EMI'] * 1000)
    return frame.drop(
        ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'],
        axis=1,
    )


# One helper for both sets keeps the train and test transformations identical.
train = _add_income_features(train)
test = _add_income_features(test)

# New figure so the distribution is not drawn over an earlier chart.
plt.figure()
sns.distplot(train['Total_Income_log'])
# optimisation
# logistic regression
# Rebuild the feature matrix from the current train frame. axis is passed by
# keyword because the positional form drop(label, 1) is rejected by current
# pandas.
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status

kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so select rows with iloc.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = LogisticRegression(random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    # Report the fold accuracy; it was computed but its print was disabled.
    print('accuracy_score', score)

# After the loop, `model` is the estimator fitted on the last fold.
pred_test = model.predict(test)
pred = model.predict_proba(xvl)[:, 1]

submission['Loan_Status'] = pred_test  # filling Loan_Status with predictions
submission['Loan_ID'] = test_original['Loan_ID']  # filling Loan_ID with test Loan_ID
# replacing 0 and 1 with N and Y
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
# Converting submission file to .csv format
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Log-new1.csv')
# (GitHub page footer, not part of the script: "Sign up for free to join this
# conversation on GitHub. Already have an account? Sign in to comment")