ansakoy/DAassignment1.R

## DAassignment1.R
### Final Lending Club Analysis###

## This analysis was performed using Windows 7 and RStudio version 3.0.1 ##

# Download data
fileUrl <- "http://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv"
download.file(fileUrl, destfile = "loansData.csv")

# Read data into R
loansData <- read.csv("loansData.csv")

## Reshaping data ##
# Converting values from character to numeric where necessary
loansData$Interest.Rate <- as.numeric(strsplit(as.character(loansData$Interest.Rate), "\\%"))
loansData$Debt.To.Income.Ratio <- as.numeric(strsplit(as.character(loansData$Debt.To.Income.Ratio), "\\%"))
# Add a new column with FICO range means
minFICO <- as.numeric(lapply(strsplit(as.character(loansData$FICO.Range), "-"), "[",1))
maxFICO <- as.numeric(lapply(strsplit(as.character(loansData$FICO.Range), "-"), "[",2))
loansData$FICO.Mean <- (minFICO + maxFICO) / 2


## Exploratory analysis ##

# Find the number of missing values
sum(is.na(loansData))

# These values are actually concentrated in two observations:
obsedvationsNA <- loansData[c(367, 1595),]
# As these observations do not contain any particular extremes, we shall exclude them from our future analysis
loansData <- na.omit(loansData)

# Look at the summary statistics of interest rate
summary(loansData$Interest.Rate)
# mean value is at roughly 13% rate and minimum and maximum values at 5,4% and 24,9%
hist(loansData$Interest.Rate, col = "blue")
# The distribution looks pretty normal

# Look at FICO score distribution
hist(loansData$FICO.Mean, col = "blue")
# The distribution is strongly right-skewed, which means that within this dataset borrowers tend
# to have rather low credit score

# Find out the distribution of observations by loan purpose
barplot(table(loansData$Loan.Purpose))
# The outstandingly popular purpose (for 1307 observations) is debt consolidation followed by
# loans for getting a credit card
# Check if there is any difference in interest rate
debtCons <- subset(loansData, loansData$Loan.Purpose == "debt_consolidation")
summary(debtCons$Interest.Rate)
# No particular difference

# Find out monthly income distribution and summary
summary(loansData$Monthly.Income)
hist(loansData$Monthly.Income, col = "blue")
# The distribution is extremely right-skewed
# Adjust the outliers
incomeAdjusted <- loansData$Monthly.Income <= 30000
summary(loansData$Monthly.Income[incomeAdjusted])
hist(loansData$Monthly.Income[incomeAdjusted], col = "blue")
# Here we see that the distribution is still right-skewed with its peak around $5000 per month

# Look at the summary and distribution of open credit lines
summary(loansData$Open.CREDIT.Lines)
hist(loansData$Open.CREDIT.Lines, col = "blue")
# The distribution is also right-skewed with most observations falling within a range between 5 and 13.
# The longer tale is created due to outliers. Adjusted:
hist(log10(loansData$Open.CREDIT.Lines + 1), col = "blue")
hist(loansData$Open.CREDIT.Lines[loansData$Open.CREDIT.Lines <= 20 & loansData$Open.CREDIT.Lines >= 1], col = "blue")

# Revolving credit balance summary and distribution
summary(loansData$Revolving.CREDIT.Balance)
hist(loansData$Revolving.CREDIT.Balance, col = "blue")
# Right-skewed distribution with a mean around 10000

# Inquiries in the Last 6 Months: summary and distribution
summary(loansData$Inquiries.in.the.Last.6.Months)
hist(loansData$Inquiries.in.the.Last.6.Months, col = "blue")
# Right-skewed distribution with over a half of observations have 0 values.
# adjust with log base of 10
hist(log10(loansData$Inquiries.in.the.Last.6.Months + 1), col = "blue")

# Loan length characteristics
table(loansData$Loan.Length)
barplot(table(loansData$Loan.Length))
# We see that the number of loans for 36 months is more than twice as big as the number of loans for 60 months
# find out if there are any difference in interest rate for these two categories
length36 <- subset(loansData, as.character(loansData$Loan.Length) == "36 months")
length60 <- subset(loansData, as.character(loansData$Loan.Length) == "60 months")
summary(length36$Interest.Rate)
summary(length60$Interest.Rate)
boxplot(loansData$Interest.Rate ~ loansData$Loan.Length, varwidth = T)
# There seems to be a considerable difference between these two categories in terms of interest rate
# with interest rate for 35 months loans generally lower than that of 60 months loans

# See if it is somehow associated with amount of loan requested.
summary(length36$Amount.Requested)
summary(length60$Amount.Requested)


## Modeling ##

# Test the basic FICO score model
model0 <- lm(loansData$Interest.Rate ~ loansData$FICO.Mean)
summary(model0) # see the summary
confint(model0) # using confidence intervals for accuracy
plot(loansData$Interest.Rate ~ loansData$FICO.Mean)
# Strong negative correlation

# Check amount requested model
model1 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested)
summary(model1)
confint(model1)
cor(loansData$Interest.Rate, loansData$Amount.Requested)
# Shows some positive correlation of 0.3, which is significant, judging by low p-value and
# the fact that confidence intervals do not include 0. About 11% of variance explained

# Add recent inquiries
model2 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months)
summary(model2)
confint(model2)
# This has slightly improved our prediction model. Now we have about 14% of variance explained

# Add open credit lines
model3 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
               loansData$Open.CREDIT.Lines)
summary(model3)
confint(model3)
# Doesn't seem to show any difference
model3a <- lm(loansData$Interest.Rate ~ loansData$Open.CREDIT.Lines)
summary(model3a)
# As doesn't this predictor show taken separately So we are not going to use it in our further modeling

# Revolving credit balance
model4 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
               loansData$Revolving.CREDIT.Balance)
summary(model4)
# This doesn't make any particular difference either

# Add the loan purpose as a predictor
# As it is a categorical variable, use dummy coding to include it into the regression equation
(codePurpose) <- C(loansData$Loan.Purpose, treatment)
model5 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
               (codePurpose))
summary(model5)
# Despite the extremes in this category, adding it to model doesn't make any particular difference

## Adjusting the model ##

# Our best model so far seems to be model 2
# Add the predicted scores to the data frame in order to visualize it in a two-dimensional plot
loansData$Interest.Rate.Predicted <- fitted(model2)

# Taking residuals into account
loansData$Residuals <- resid(model2)
hist(loansData$Residuals)
plot(loansData$Interest.Rate.Predicted ~ loansData$Residuals)
abline(lm(loansData$Interest.Rate.Predicted ~ loansData$Residuals), col="blue")

# Distribution of the residuals is normal and they show no correlation with predicted scores,
# which indicates that the model is trustworthy.

## Figure making ##
loansData$Length.Color <- gsub("36 months", 4, loansData$Loan.Length)
loansData$Length.Color <- gsub("60 months", 2, loansData$Length.Color)


plot(loansData$Interest.Rate ~ loansData$Interest.Rate.Predicted, pch = 20,
     col = loansData$Length.Color,
     main = "Income rate vs. amount requested and recent credit inquiries",
     xlab = "Predictors combined", ylab = "Interest rate")
legend("bottomright",legend=c("36 months","60 months"),col=c("blue","red"),pch=c(20,20))
	### Final Lending Club Analysis###

	## This analysis was performed using Windows 7 and RStudio version 3.0.1 ##

	# Download data
	fileUrl <- "http://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv"
	download.file(fileUrl, destfile = "loansData.csv")

	# Read data into R
	loansData <- read.csv("loansData.csv")

	## Reshaping data ##
	# Converting values from character to numeric where necessary
	loansData$Interest.Rate <- as.numeric(strsplit(as.character(loansData$Interest.Rate), "\\%"))
	loansData$Debt.To.Income.Ratio <- as.numeric(strsplit(as.character(loansData$Debt.To.Income.Ratio), "\\%"))
	# Add a new column with FICO range means
	minFICO <- as.numeric(lapply(strsplit(as.character(loansData$FICO.Range), "-"), "[",1))
	maxFICO <- as.numeric(lapply(strsplit(as.character(loansData$FICO.Range), "-"), "[",2))
	loansData$FICO.Mean <- (minFICO + maxFICO) / 2


	## Exploratory analysis ##

	# Find the number of missing values
	sum(is.na(loansData))

	# These values are actually concentrated in two observations:
	obsedvationsNA <- loansData[c(367, 1595),]
	# As these observations do not contain any particular extremes, we shall exclude them from our future analysis
	loansData <- na.omit(loansData)

	# Look at the summary statistics of interest rate
	summary(loansData$Interest.Rate)
	# mean value is at roughly 13% rate and minimum and maximum values at 5,4% and 24,9%
	hist(loansData$Interest.Rate, col = "blue")
	# The distribution looks pretty normal

	# Look at FICO score distribution
	hist(loansData$FICO.Mean, col = "blue")
	# The distribution is strongly right-skewed, which means that within this dataset borrowers tend
	# to have rather low credit score

	# Find out the distribution of observations by loan purpose
	barplot(table(loansData$Loan.Purpose))
	# The outstandingly popular purpose (for 1307 observations) is debt consolidation followed by
	# loans for getting a credit card
	# Check if there is any difference in interest rate
	debtCons <- subset(loansData, loansData$Loan.Purpose == "debt_consolidation")
	summary(debtCons$Interest.Rate)
	# No particular difference

	# Find out monthly income distribution and summary
	summary(loansData$Monthly.Income)
	hist(loansData$Monthly.Income, col = "blue")
	# The distribution is extremely right-skewed
	# Adjust the outliers
	incomeAdjusted <- loansData$Monthly.Income <= 30000
	summary(loansData$Monthly.Income[incomeAdjusted])
	hist(loansData$Monthly.Income[incomeAdjusted], col = "blue")
	# Here we see that the distribution is still right-skewed with its peak around $5000 per month

	# Look at the summary and distribution of open credit lines
	summary(loansData$Open.CREDIT.Lines)
	hist(loansData$Open.CREDIT.Lines, col = "blue")
	# The distribution is also right-skewed with most observations falling within a range between 5 and 13.
	# The longer tale is created due to outliers. Adjusted:
	hist(log10(loansData$Open.CREDIT.Lines + 1), col = "blue")
	hist(loansData$Open.CREDIT.Lines[loansData$Open.CREDIT.Lines <= 20 & loansData$Open.CREDIT.Lines >= 1], col = "blue")

	# Revolving credit balance summary and distribution
	summary(loansData$Revolving.CREDIT.Balance)
	hist(loansData$Revolving.CREDIT.Balance, col = "blue")
	# Right-skewed distribution with a mean around 10000

	# Inquiries in the Last 6 Months: summary and distribution
	summary(loansData$Inquiries.in.the.Last.6.Months)
	hist(loansData$Inquiries.in.the.Last.6.Months, col = "blue")
	# Right-skewed distribution with over a half of observations have 0 values.
	# adjust with log base of 10
	hist(log10(loansData$Inquiries.in.the.Last.6.Months + 1), col = "blue")

	# Loan length characteristics
	table(loansData$Loan.Length)
	barplot(table(loansData$Loan.Length))
	# We see that the number of loans for 36 months is more than twice as big as the number of loans for 60 months
	# find out if there are any difference in interest rate for these two categories
	length36 <- subset(loansData, as.character(loansData$Loan.Length) == "36 months")
	length60 <- subset(loansData, as.character(loansData$Loan.Length) == "60 months")
	summary(length36$Interest.Rate)
	summary(length60$Interest.Rate)
	boxplot(loansData$Interest.Rate ~ loansData$Loan.Length, varwidth = T)
	# There seems to be a considerable difference between these two categories in terms of interest rate
	# with interest rate for 35 months loans generally lower than that of 60 months loans

	# See if it is somehow associated with amount of loan requested.
	summary(length36$Amount.Requested)
	summary(length60$Amount.Requested)


	## Modeling ##

	# Test the basic FICO score model
	model0 <- lm(loansData$Interest.Rate ~ loansData$FICO.Mean)
	summary(model0) # see the summary
	confint(model0) # using confidence intervals for accuracy
	plot(loansData$Interest.Rate ~ loansData$FICO.Mean)
	# Strong negative correlation

	# Check amount requested model
	model1 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested)
	summary(model1)
	confint(model1)
	cor(loansData$Interest.Rate, loansData$Amount.Requested)
	# Shows some positive correlation of 0.3, which is significant, judging by low p-value and
	# the fact that confidence intervals do not include 0. About 11% of variance explained

	# Add recent inquiries
	model2 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months)
	summary(model2)
	confint(model2)
	# This has slightly improved our prediction model. Now we have about 14% of variance explained

	# Add open credit lines
	model3 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
	loansData$Open.CREDIT.Lines)
	summary(model3)
	confint(model3)
	# Doesn't seem to show any difference
	model3a <- lm(loansData$Interest.Rate ~ loansData$Open.CREDIT.Lines)
	summary(model3a)
	# As doesn't this predictor show taken separately So we are not going to use it in our further modeling

	# Revolving credit balance
	model4 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
	loansData$Revolving.CREDIT.Balance)
	summary(model4)
	# This doesn't make any particular difference either

	# Add the loan purpose as a predictor
	# As it is a categorical variable, use dummy coding to include it into the regression equation
	(codePurpose) <- C(loansData$Loan.Purpose, treatment)
	model5 <- lm(loansData$Interest.Rate ~ loansData$Amount.Requested + loansData$Inquiries.in.the.Last.6.Months +
	(codePurpose))
	summary(model5)
	# Despite the extremes in this category, adding it to model doesn't make any particular difference

	## Adjusting the model ##

	# Our best model so far seems to be model 2
	# Add the predicted scores to the data frame in order to visualize it in a two-dimensional plot
	loansData$Interest.Rate.Predicted <- fitted(model2)

	# Taking residuals into account
	loansData$Residuals <- resid(model2)
	hist(loansData$Residuals)
	plot(loansData$Interest.Rate.Predicted ~ loansData$Residuals)
	abline(lm(loansData$Interest.Rate.Predicted ~ loansData$Residuals), col="blue")

	# Distribution of the residuals is normal and they show no correlation with predicted scores,
	# which indicates that the model is trustworthy.

	## Figure making ##
	loansData$Length.Color <- gsub("36 months", 4, loansData$Loan.Length)
	loansData$Length.Color <- gsub("60 months", 2, loansData$Length.Color)


	plot(loansData$Interest.Rate ~ loansData$Interest.Rate.Predicted, pch = 20,
	col = loansData$Length.Color,
	main = "Income rate vs. amount requested and recent credit inquiries",
	xlab = "Predictors combined", ylab = "Interest rate")
	legend("bottomright",legend=c("36 months","60 months"),col=c("blue","red"),pch=c(20,20))