conorbmurphy/gist:9f9045a233d4249870d5

## gistfile1.txt
# load required packages
library(caret)
library(pROC)
library(e1071)

blood <- read.csv(file.choose())
names(blood) <- c("Donor","Recency","Frequency","Monetary","Times","Donated")

t


# Creates a data frame for the quantiles of rows 2:5 in blood
bloodQuant <- data.frame()
for (i in 1:5) {
  for(j in 2:5) {
    bloodQuant[j-1,i] <- quantile(blood[,j])[i]
    }
}
names(bloodQuant) <- c(0, 25, 50, 75, 100)

# Next steps:
# For recency: give score 5 for between 0-25, 4 for between 25-50, etc
# For other variables: give score 1 for bewteen 0-25, 2 for between 25-50, etc

#rfm scoring code. need to update.
ifelse(col>=16,r_score<-1,ifelse(col>=11,r_score<-2,ifelse(col>=4,r_score<-3,ifelse(col>=2,r_score<-4,r_score<-5))))}


p <- 1/score
q <- quantile(df$col,probs=seq(0,1,by=p))


a <- quantile(blood[,2], probs = seq(0,1,by=P))
b <- quantile(blood[,3], probs = seq(0,1,by=P))
c <- quantile(blood[,4], probs = seq(0,1,by=P))
d <- quantile(blood[,5], probs = seq(0,1,by=P))
quants <- rbind(a, b, c, d)


#bar plot of donations by RFM Score to visualize the counts of 1 and 0 in each RFM group
counts <- table(blood$donated, blood$group)
barplot(counts,main="Donated in March by RFM Score",xlab="RFM Score",col=c("green","red"),legend=rownames(counts),beside=T)

# create column showing whether they gave blood in March as yes and no
# flag it is as a factor
blood$donated2 <- ifelse(blood$donated==1,'yes','no')
blood$donated2 <- as.factor(blood$donated2)
outcomeName <- 'donated2'

# split test data into portion for training model and for validating the model
set.seed(700)
trainInd <- createDataPartition(blood[,outcomeName],p=.7,list=F,times=1) # .7 refers to 70% partition,
blood.train <- blood[trainInd,]
blood.test <- blood[-trainInd,]

# Set variables for training and creating model. below is the 'generalized boosted regression model' method.
objControl <- trainControl(method='cv',number=3,returnResamp='none',summaryFunction=twoClassSummary,classProbs=TRUE)
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="gbm",trControl=objControl,metric="ROC",preProc = c("center","scale"))

#Also tried knn and neural network methods.
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="knn",preProc = c("center","scale"),tuneLength=10,trControl=trainControl(method="cv"))
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="nnet",preProc = "range",tuneLength=10,trace=F,maxit=1000)
#tuneLength refers to the numer of nodes, trace = TRUE will run through the learning process, maxit is the max number of iteratons


# review results
summary(blood.fit)
print(blood.fit)

# test the trained model against validation data and review results
predictions <- predict(object=blood.fit,blood.test[,1:5],type="raw")
head(predictions)
print(postResample(pred=predictions,obs=as.factor(blood.test[,outcomeName])))
predictions <- predict(object=blood.fit,blood.test[,1:5],type="prob")
head(predictions)

# review area under curve to see accuracy of prediction
auc <- roc(ifelse(blood.test[,outcomeName]=="yes",1,0),predictions[[2]])
auc$auc # This shows accuracy of the model

# plot importance of variables in model
var <- varImp(blood.fit)
plot(var)


df <- read.csv(file.choose())
dftest <- as.data.frame(cbind(df[,2], df[,3], df[,5], df[,6]))
names(dftest) <- c("Recency", "Frequency", "Time", "Monetary")


scoring <- function (df,column,r=5){

#get the length of rows of df
len <- dim(df)[1]

score <- rep(0,times=len)

# get the quantity of rows per 1/r e.g. 1/5
nr <- round(len / r)
if (nr > 0){

	# seperate the rows by r aliquots
	rStart <-0
	rEnd <- 0
	for (i in 1:r){

		#set the start row number and end row number
		rStart = rEnd+1

		#skip one "i" if the rStart is already in the i+1 or i+2 or ...scope.
		if (rStart> i*nr) next

		if (i == r){
			if(rStart<=len ) rEnd <- len else next
		}else{
			rEnd <- i*nr
		}

		# set the Recency score
		score[rStart:rEnd]<- r-i+1

		# make sure the customer who have the same recency have the same score
		s <- rEnd+1
		if(i<r & s <= len){
			for(u in s: len){
				if(df[rEnd,column]==df[u,column]){
					score[u]<- r-i+1
					rEnd <- u
				}else{
					break;
				}
			}

		}

	}

}
	return(score)

} #end of function Scoring

getIndependentScore <- function(df,r=5,f=5,m=5) {

if (r<=0 || f<=0 || m<=0) return

#order and the score
df <- df[order(df$Recency,-df$Frequency,-df$Monetary),]
R_Score <- scoring(df,"Recency",r)
df <- cbind(df, R_Score)

df <- df[order(-df$Frequency,df$Recency,-df$Monetary),]
F_Score <- scoring(df,"Frequency",f)
df <- cbind(df, F_Score)

df <- df[order(-df$Monetary,df$Recency,-df$Frequency),]
M_Score <- scoring(df,"Monetary",m)
df <- cbind(df, M_Score)

#order the dataframe by R_Score, F_Score, and M_Score desc
df <- df[order(-df$R_Score,-df$F_Score,-df$M_Score),]

# caculate the total score
Total_Score <- c(100*df$R_Score + 10*df$F_Score+df$M_Score)

df <- cbind(df,Total_Score)

return (df)

} # end of function getIndependentScore

test_score <- getIndependentScore(df)

drawHistograms <- function(df,r=5,f=5,m=5){

#set the layout plot window
par(mfrow = c(f,r))

names <-rep("",times=m)
for(i in 1:m) names[i]<-paste("M",i)


for (i in 1:f){
	for (j in 1:r){
		c <- rep(0,times=m)
		for(k in 1:m){
			tmpdf <-df[df$R_Score==j & df$F_Score==i & df$M_Score==k,]
			c[k]<- dim(tmpdf)[1]

		}
		if (i==1 & j==1)
			barplot(c,col="lightblue",names.arg=names)
		else
			barplot(c,col="lightblue")
		if (j==1) title(ylab=paste("F",i))
		if (i==1) title(main=paste("R",j))

	}

}

par(mfrow = c(1,1))

} # end of drawHistograms function

drawHistograms(test_score)

# from http://www.dataapple.net/?p=84
	# load required packages
	library(caret)
	library(pROC)
	library(e1071)

	blood <- read.csv(file.choose())
	names(blood) <- c("Donor","Recency","Frequency","Monetary","Times","Donated")

	t



	# Creates a data frame for the quantiles of rows 2:5 in blood
	bloodQuant <- data.frame()
	for (i in 1:5) {
	for(j in 2:5) {
	bloodQuant[j-1,i] <- quantile(blood[,j])[i]
	}
	}
	names(bloodQuant) <- c(0, 25, 50, 75, 100)

	# Next steps:
	# For recency: give score 5 for between 0-25, 4 for between 25-50, etc
	# For other variables: give score 1 for bewteen 0-25, 2 for between 25-50, etc

	#rfm scoring code. need to update.
	ifelse(col>=16,r_score<-1,ifelse(col>=11,r_score<-2,ifelse(col>=4,r_score<-3,ifelse(col>=2,r_score<-4,r_score<-5))))}


	p <- 1/score
	q <- quantile(df$col,probs=seq(0,1,by=p))



	a <- quantile(blood[,2], probs = seq(0,1,by=P))
	b <- quantile(blood[,3], probs = seq(0,1,by=P))
	c <- quantile(blood[,4], probs = seq(0,1,by=P))
	d <- quantile(blood[,5], probs = seq(0,1,by=P))
	quants <- rbind(a, b, c, d)






	#bar plot of donations by RFM Score to visualize the counts of 1 and 0 in each RFM group
	counts <- table(blood$donated, blood$group)
	barplot(counts,main="Donated in March by RFM Score",xlab="RFM Score",col=c("green","red"),legend=rownames(counts),beside=T)

	# create column showing whether they gave blood in March as yes and no
	# flag it is as a factor
	blood$donated2 <- ifelse(blood$donated==1,'yes','no')
	blood$donated2 <- as.factor(blood$donated2)
	outcomeName <- 'donated2'

	# split test data into portion for training model and for validating the model
	set.seed(700)
	trainInd <- createDataPartition(blood[,outcomeName],p=.7,list=F,times=1) # .7 refers to 70% partition,
	blood.train <- blood[trainInd,]
	blood.test <- blood[-trainInd,]

	# Set variables for training and creating model. below is the 'generalized boosted regression model' method.
	objControl <- trainControl(method='cv',number=3,returnResamp='none',summaryFunction=twoClassSummary,classProbs=TRUE)
	blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="gbm",trControl=objControl,metric="ROC",preProc = c("center","scale"))

	#Also tried knn and neural network methods.
	blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="knn",preProc = c("center","scale"),tuneLength=10,trControl=trainControl(method="cv"))
	blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="nnet",preProc = "range",tuneLength=10,trace=F,maxit=1000)
	#tuneLength refers to the numer of nodes, trace = TRUE will run through the learning process, maxit is the max number of iteratons


	# review results
	summary(blood.fit)
	print(blood.fit)

	# test the trained model against validation data and review results
	predictions <- predict(object=blood.fit,blood.test[,1:5],type="raw")
	head(predictions)
	print(postResample(pred=predictions,obs=as.factor(blood.test[,outcomeName])))
	predictions <- predict(object=blood.fit,blood.test[,1:5],type="prob")
	head(predictions)

	# review area under curve to see accuracy of prediction
	auc <- roc(ifelse(blood.test[,outcomeName]=="yes",1,0),predictions[[2]])
	auc$auc # This shows accuracy of the model

	# plot importance of variables in model
	var <- varImp(blood.fit)
	plot(var)




	df <- read.csv(file.choose())
	dftest <- as.data.frame(cbind(df[,2], df[,3], df[,5], df[,6]))
	names(dftest) <- c("Recency", "Frequency", "Time", "Monetary")


	scoring <- function (df,column,r=5){

	#get the length of rows of df
	len <- dim(df)[1]

	score <- rep(0,times=len)

	# get the quantity of rows per 1/r e.g. 1/5
	nr <- round(len / r)
	if (nr > 0){

	# seperate the rows by r aliquots
	rStart <-0
	rEnd <- 0
	for (i in 1:r){

	#set the start row number and end row number
	rStart = rEnd+1

	#skip one "i" if the rStart is already in the i+1 or i+2 or ...scope.
	if (rStart> i*nr) next

	if (i == r){
	if(rStart<=len ) rEnd <- len else next
	}else{
	rEnd <- i*nr
	}

	# set the Recency score
	score[rStart:rEnd]<- r-i+1

	# make sure the customer who have the same recency have the same score
	s <- rEnd+1
	if(i<r & s <= len){
	for(u in s: len){
	if(df[rEnd,column]==df[u,column]){
	score[u]<- r-i+1
	rEnd <- u
	}else{
	break;
	}
	}

	}

	}

	}
	return(score)

	} #end of function Scoring

	getIndependentScore <- function(df,r=5,f=5,m=5) {

	if (r<=0 \|\| f<=0 \|\| m<=0) return

	#order and the score
	df <- df[order(df$Recency,-df$Frequency,-df$Monetary),]
	R_Score <- scoring(df,"Recency",r)
	df <- cbind(df, R_Score)

	df <- df[order(-df$Frequency,df$Recency,-df$Monetary),]
	F_Score <- scoring(df,"Frequency",f)
	df <- cbind(df, F_Score)

	df <- df[order(-df$Monetary,df$Recency,-df$Frequency),]
	M_Score <- scoring(df,"Monetary",m)
	df <- cbind(df, M_Score)

	#order the dataframe by R_Score, F_Score, and M_Score desc
	df <- df[order(-df$R_Score,-df$F_Score,-df$M_Score),]

	# caculate the total score
	Total_Score <- c(100df$R_Score + 10df$F_Score+df$M_Score)

	df <- cbind(df,Total_Score)

	return (df)

	} # end of function getIndependentScore

	test_score <- getIndependentScore(df)

	drawHistograms <- function(df,r=5,f=5,m=5){

	#set the layout plot window
	par(mfrow = c(f,r))

	names <-rep("",times=m)
	for(i in 1:m) names[i]<-paste("M",i)


	for (i in 1:f){
	for (j in 1:r){
	c <- rep(0,times=m)
	for(k in 1:m){
	tmpdf <-df[df$R_Score==j & df$F_Score==i & df$M_Score==k,]
	c[k]<- dim(tmpdf)[1]

	}
	if (i==1 & j==1)
	barplot(c,col="lightblue",names.arg=names)
	else
	barplot(c,col="lightblue")
	if (j==1) title(ylab=paste("F",i))
	if (i==1) title(main=paste("R",j))

	}

	}

	par(mfrow = c(1,1))

	} # end of drawHistograms function

	drawHistograms(test_score)

	# from http://www.dataapple.net/?p=84