Skip to content

Instantly share code, notes, and snippets.

@ansakoy
Created November 18, 2013 03:43
Show Gist options
  • Save ansakoy/7522116 to your computer and use it in GitHub Desktop.
Save ansakoy/7522116 to your computer and use it in GitHub Desktop.
### Final Lending Club Analysis###
## This analysis was performed using Windows 7 and RStudio version 3.0.1 ##
# Download data
fileUrl <- "http://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv"
download.file(fileUrl, destfile = "loansData.csv")
# Read data into R
loansData <- read.csv("loansData.csv")
## Reshaping data ##
# Converting values from character to numeric where necessary
loansData$Interest.Rate <- as.numeric(strsplit(as.character(loansData$Interest.Rate), "\\%"))
loansData$Debt.To.Income.Ratio <- as.numeric(strsplit(as.character(loansData$Debt.To.Income.Ratio), "\\%"))
# Add a new column with FICO range means
minFICO <- as.numeric(lapply(strsplit(as.character(loansData$FICO.Range), "-"), "[",1))
maxFICO <- as.numeric(lapply(strsplit(as.character(loansData$FICO.Range), "-"), "[",2))
loansData$FICO.Mean <- (minFICO + maxFICO) / 2
## Exploratory analysis ##
# Find the number of missing values
sum(is.na(loansData))
# These values are actually concentrated in two observations:
obsedvationsNA <- loansData[c(367, 1595),]
# As these observations do not contain any particular extremes, we shall exclude them from our future analysis
loansData <- na.omit(loansData)
# Look at the summary statistics of interest rate
summary(loansData$Interest.Rate)
# mean value is at roughly 13% rate and minimum and maximum values at 5,4% and 24,9%
hist(loansData$Interest.Rate, col = "blue")
# The distribution looks pretty normal
# Look at FICO score distribution
hist(loansData$FICO.Mean, col = "blue")
# The distribution is strongly right-skewed, which means that within this dataset borrowers tend
# to have rather low credit score
# Find out the distribution of observations by loan purpose
barplot(table(loansData$Loan.Purpose))
# The outstandingly popular purpose (for 1307 observations) is debt consolidation followed by
# loans for getting a credit card
# Check if there is any difference in interest rate
debtCons <- subset(loansData, loansData$Loan.Purpose == "debt_consolidation")
summary(debtCons$Interest.Rate)
# No particular difference
# Find out monthly income distribution and summary
summary(loansData$Monthly.Income)
hist(loansData$Monthly.Income, col = "blue")
# The distribution is extremely right-skewed
# Adjust the outliers
incomeAdjusted <- loansData$Monthly.Income <= 30000
summary(loansData$Monthly.Income[incomeAdjusted])
hist(loansData$Monthly.Income[incomeAdjusted], col = "blue")
# Here we see that the distribution is still right-skewed with its peak around $5000 per month
# Look at the summary and distribution of open credit lines
summary(loansData$Open.CREDIT.Lines)
hist(loansData$Open.CREDIT.Lines, col = "blue")
# The distribution is also right-skewed with most observations falling within a range between 5 and 13.
# The longer tale is created due to outliers. Adjusted:
hist(log10(loansData$Open.CREDIT.Lines + 1), col = "blue")
hist(loansData$Open.CREDIT.Lines[loansData$Open.CREDIT.Lines <= 20 & loansData$Open.CREDIT.Lines >= 1], col = "blue")
# Revolving credit balance summary and distribution
summary(loansData$Revolving.CREDIT.Balance)
hist(loansData$Revolving.CREDIT.Balance, col = "blue")
# Right-skewed distribution with a mean around 10000
# Inquiries in the Last 6 Months: summary and distribution
summary(loansData$Inquiries.in.the.Last.6.Months)
hist(loansData$Inquiries.in.the.Last.6.Months, col = "blue")
# Right-skewed distribution with over a half of observations have 0 values.
# adjust with log base of 10
hist(log10(loansData$Inquiries.in.the.Last.6.Months + 1), col = "blue")
# Loan length characteristics
table(loansData$Loan.Length)
barplot(table(loansData$Loan.Length))
# We see that the number of loans for 36 months is more than twice as big as the number of loans for 60 months
# find out if there are any difference in interest rate for these two categories
length36 <- subset(loansData, as.character(loansData$Loan.Length) == "36 months")
length60 <- subset(loansData, as.character(loansData$Loan.Length) == "60 months")
summary(length36$Interest.Rate)
summary(length60$Interest.Rate)
boxplot(loansData$Interest.Rate ~ loansData$Loan.Length, varwidth = T)
# There seems to be a considerable difference between these two categories in terms of interest rate
# with interest rate for 35 months loans generally lower than that of 60 months loans
# See if it is somehow associated with amount of loan requested.
summary(length36$Amount.Requested)
summary(length60$Amount.Requested)
## Modeling ##
# Test the basic FICO score model
model0 <- lm(loansData$Interest.Rate ~ loansData$FICO.Mean)
summary(model0) # see the summary
confint(model0) # using confidence intervals for accuracy
plot(loansData$Interest.Rate ~ loansData$FICO.Mean)
# Strong negative correlation
# Check amount requested model
model1 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested)
summary(model1)
confint(model1)
cor(loansData$Interest.Rate, loansData$Amount.Requested)
# Shows some positive correlation of 0.3, which is significant, judging by low p-value and
# the fact that confidence intervals do not include 0. About 11% of variance explained
# Add recent inquiries
model2 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months)
summary(model2)
confint(model2)
# This has slightly improved our prediction model. Now we have about 14% of variance explained
# Add open credit lines
model3 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
loansData$Open.CREDIT.Lines)
summary(model3)
confint(model3)
# Doesn't seem to show any difference
model3a <- lm(loansData$Interest.Rate ~ loansData$Open.CREDIT.Lines)
summary(model3a)
# As doesn't this predictor show taken separately So we are not going to use it in our further modeling
# Revolving credit balance
model4 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
loansData$Revolving.CREDIT.Balance)
summary(model4)
# This doesn't make any particular difference either
# Add the loan purpose as a predictor
# As it is a categorical variable, use dummy coding to include it into the regression equation
(codePurpose) <- C(loansData$Loan.Purpose, treatment)
model5 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
(codePurpose))
summary(model5)
# Despite the extremes in this category, adding it to model doesn't make any particular difference
## Adjusting the model ##
# Our best model so far seems to be model 2
# Add the predicted scores to the data frame in order to visualize it in a two-dimensional plot
loansData$Interest.Rate.Predicted <- fitted(model2)
# Taking residuals into account
loansData$Residuals <- resid(model2)
hist(loansData$Residuals)
plot(loansData$Interest.Rate.Predicted ~ loansData$Residuals)
abline(lm(loansData$Interest.Rate.Predicted ~ loansData$Residuals), col="blue")
# Distribution of the residuals is normal and they show no correlation with predicted scores,
# which indicates that the model is trustworthy.
## Figure making ##
loansData$Length.Color <- gsub("36 months", 4, loansData$Loan.Length)
loansData$Length.Color <- gsub("60 months", 2, loansData$Length.Color)
plot(loansData$Interest.Rate ~ loansData$Interest.Rate.Predicted, pch = 20,
col = loansData$Length.Color,
main = "Income rate vs. amount requested and recent credit inquiries",
xlab = "Predictors combined", ylab = "Interest rate")
legend("bottomright",legend=c("36 months","60 months"),col=c("blue","red"),pch=c(20,20))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment