Created
November 18, 2013 03:43
-
-
Save ansakoy/7522116 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Final Lending Club Analysis### | |
## This analysis was performed using Windows 7 and RStudio version 3.0.1 ## | |
# Download data | |
fileUrl <- "http://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv" | |
download.file(fileUrl, destfile = "loansData.csv") | |
# Read data into R | |
loansData <- read.csv("loansData.csv") | |
## Reshaping data ## | |
# Converting values from character to numeric where necessary | |
loansData$Interest.Rate <- as.numeric(strsplit(as.character(loansData$Interest.Rate), "\\%")) | |
loansData$Debt.To.Income.Ratio <- as.numeric(strsplit(as.character(loansData$Debt.To.Income.Ratio), "\\%")) | |
# Add a new column with FICO range means | |
minFICO <- as.numeric(lapply(strsplit(as.character(loansData$FICO.Range), "-"), "[",1)) | |
maxFICO <- as.numeric(lapply(strsplit(as.character(loansData$FICO.Range), "-"), "[",2)) | |
loansData$FICO.Mean <- (minFICO + maxFICO) / 2 | |
## Exploratory analysis ## | |
# Find the number of missing values | |
sum(is.na(loansData)) | |
# These values are actually concentrated in two observations: | |
obsedvationsNA <- loansData[c(367, 1595),] | |
# As these observations do not contain any particular extremes, we shall exclude them from our future analysis | |
loansData <- na.omit(loansData) | |
# Look at the summary statistics of interest rate | |
summary(loansData$Interest.Rate) | |
# mean value is at roughly 13% rate and minimum and maximum values at 5,4% and 24,9% | |
hist(loansData$Interest.Rate, col = "blue") | |
# The distribution looks pretty normal | |
# Look at FICO score distribution | |
hist(loansData$FICO.Mean, col = "blue") | |
# The distribution is strongly right-skewed, which means that within this dataset borrowers tend | |
# to have rather low credit score | |
# Find out the distribution of observations by loan purpose | |
barplot(table(loansData$Loan.Purpose)) | |
# The outstandingly popular purpose (for 1307 observations) is debt consolidation followed by | |
# loans for getting a credit card | |
# Check if there is any difference in interest rate | |
debtCons <- subset(loansData, loansData$Loan.Purpose == "debt_consolidation") | |
summary(debtCons$Interest.Rate) | |
# No particular difference | |
# Find out monthly income distribution and summary | |
summary(loansData$Monthly.Income) | |
hist(loansData$Monthly.Income, col = "blue") | |
# The distribution is extremely right-skewed | |
# Adjust the outliers | |
incomeAdjusted <- loansData$Monthly.Income <= 30000 | |
summary(loansData$Monthly.Income[incomeAdjusted]) | |
hist(loansData$Monthly.Income[incomeAdjusted], col = "blue") | |
# Here we see that the distribution is still right-skewed with its peak around $5000 per month | |
# Look at the summary and distribution of open credit lines | |
summary(loansData$Open.CREDIT.Lines) | |
hist(loansData$Open.CREDIT.Lines, col = "blue") | |
# The distribution is also right-skewed with most observations falling within a range between 5 and 13. | |
# The longer tale is created due to outliers. Adjusted: | |
hist(log10(loansData$Open.CREDIT.Lines + 1), col = "blue") | |
hist(loansData$Open.CREDIT.Lines[loansData$Open.CREDIT.Lines <= 20 & loansData$Open.CREDIT.Lines >= 1], col = "blue") | |
# Revolving credit balance summary and distribution | |
summary(loansData$Revolving.CREDIT.Balance) | |
hist(loansData$Revolving.CREDIT.Balance, col = "blue") | |
# Right-skewed distribution with a mean around 10000 | |
# Inquiries in the Last 6 Months: summary and distribution | |
summary(loansData$Inquiries.in.the.Last.6.Months) | |
hist(loansData$Inquiries.in.the.Last.6.Months, col = "blue") | |
# Right-skewed distribution with over a half of observations have 0 values. | |
# adjust with log base of 10 | |
hist(log10(loansData$Inquiries.in.the.Last.6.Months + 1), col = "blue") | |
# Loan length characteristics | |
table(loansData$Loan.Length) | |
barplot(table(loansData$Loan.Length)) | |
# We see that the number of loans for 36 months is more than twice as big as the number of loans for 60 months | |
# find out if there are any difference in interest rate for these two categories | |
length36 <- subset(loansData, as.character(loansData$Loan.Length) == "36 months") | |
length60 <- subset(loansData, as.character(loansData$Loan.Length) == "60 months") | |
summary(length36$Interest.Rate) | |
summary(length60$Interest.Rate) | |
boxplot(loansData$Interest.Rate ~ loansData$Loan.Length, varwidth = T) | |
# There seems to be a considerable difference between these two categories in terms of interest rate | |
# with interest rate for 35 months loans generally lower than that of 60 months loans | |
# See if it is somehow associated with amount of loan requested. | |
summary(length36$Amount.Requested) | |
summary(length60$Amount.Requested) | |
## Modeling ## | |
# Test the basic FICO score model | |
model0 <- lm(loansData$Interest.Rate ~ loansData$FICO.Mean) | |
summary(model0) # see the summary | |
confint(model0) # using confidence intervals for accuracy | |
plot(loansData$Interest.Rate ~ loansData$FICO.Mean) | |
# Strong negative correlation | |
# Check amount requested model | |
model1 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested) | |
summary(model1) | |
confint(model1) | |
cor(loansData$Interest.Rate, loansData$Amount.Requested) | |
# Shows some positive correlation of 0.3, which is significant, judging by low p-value and | |
# the fact that confidence intervals do not include 0. About 11% of variance explained | |
# Add recent inquiries | |
model2 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months) | |
summary(model2) | |
confint(model2) | |
# This has slightly improved our prediction model. Now we have about 14% of variance explained | |
# Add open credit lines | |
model3 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months + | |
loansData$Open.CREDIT.Lines) | |
summary(model3) | |
confint(model3) | |
# Doesn't seem to show any difference | |
model3a <- lm(loansData$Interest.Rate ~ loansData$Open.CREDIT.Lines) | |
summary(model3a) | |
# As doesn't this predictor show taken separately So we are not going to use it in our further modeling | |
# Revolving credit balance | |
model4 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months + | |
loansData$Revolving.CREDIT.Balance) | |
summary(model4) | |
# This doesn't make any particular difference either | |
# Add the loan purpose as a predictor | |
# As it is a categorical variable, use dummy coding to include it into the regression equation | |
(codePurpose) <- C(loansData$Loan.Purpose, treatment) | |
model5 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months + | |
(codePurpose)) | |
summary(model5) | |
# Despite the extremes in this category, adding it to model doesn't make any particular difference | |
## Adjusting the model ## | |
# Our best model so far seems to be model 2 | |
# Add the predicted scores to the data frame in order to visualize it in a two-dimensional plot | |
loansData$Interest.Rate.Predicted <- fitted(model2) | |
# Taking residuals into account | |
loansData$Residuals <- resid(model2) | |
hist(loansData$Residuals) | |
plot(loansData$Interest.Rate.Predicted ~ loansData$Residuals) | |
abline(lm(loansData$Interest.Rate.Predicted ~ loansData$Residuals), col="blue") | |
# Distribution of the residuals is normal and they show no correlation with predicted scores, | |
# which indicates that the model is trustworthy. | |
## Figure making ## | |
loansData$Length.Color <- gsub("36 months", 4, loansData$Loan.Length) | |
loansData$Length.Color <- gsub("60 months", 2, loansData$Length.Color) | |
plot(loansData$Interest.Rate ~ loansData$Interest.Rate.Predicted, pch = 20, | |
col = loansData$Length.Color, | |
main = "Income rate vs. amount requested and recent credit inquiries", | |
xlab = "Predictors combined", ylab = "Interest rate") | |
legend("bottomright",legend=c("36 months","60 months"),col=c("blue","red"),pch=c(20,20)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment