Last active
February 11, 2016 19:01
-
-
Save conorbmurphy/9f9045a233d4249870d5 to your computer and use it in GitHub Desktop.
RFM Draft for Blood Donations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load required packages | |
library(caret) | |
library(pROC) | |
library(e1071) | |
blood <- read.csv(file.choose()) | |
names(blood) <- c("Donor","Recency","Frequency","Monetary","Times","Donated") | |
t | |
# Creates a data frame for the quantiles of rows 2:5 in blood | |
bloodQuant <- data.frame() | |
for (i in 1:5) { | |
for(j in 2:5) { | |
bloodQuant[j-1,i] <- quantile(blood[,j])[i] | |
} | |
} | |
names(bloodQuant) <- c(0, 25, 50, 75, 100) | |
# Next steps: | |
# For recency: give score 5 for between 0-25, 4 for between 25-50, etc | |
# For other variables: give score 1 for bewteen 0-25, 2 for between 25-50, etc | |
#rfm scoring code. need to update. | |
ifelse(col>=16,r_score<-1,ifelse(col>=11,r_score<-2,ifelse(col>=4,r_score<-3,ifelse(col>=2,r_score<-4,r_score<-5))))} | |
p <- 1/score | |
q <- quantile(df$col,probs=seq(0,1,by=p)) | |
a <- quantile(blood[,2], probs = seq(0,1,by=P)) | |
b <- quantile(blood[,3], probs = seq(0,1,by=P)) | |
c <- quantile(blood[,4], probs = seq(0,1,by=P)) | |
d <- quantile(blood[,5], probs = seq(0,1,by=P)) | |
quants <- rbind(a, b, c, d) | |
#bar plot of donations by RFM Score to visualize the counts of 1 and 0 in each RFM group | |
counts <- table(blood$donated, blood$group) | |
barplot(counts,main="Donated in March by RFM Score",xlab="RFM Score",col=c("green","red"),legend=rownames(counts),beside=T) | |
# create column showing whether they gave blood in March as yes and no | |
# flag it is as a factor | |
blood$donated2 <- ifelse(blood$donated==1,'yes','no') | |
blood$donated2 <- as.factor(blood$donated2) | |
outcomeName <- 'donated2' | |
# split test data into portion for training model and for validating the model | |
set.seed(700) | |
trainInd <- createDataPartition(blood[,outcomeName],p=.7,list=F,times=1) # .7 refers to 70% partition, | |
blood.train <- blood[trainInd,] | |
blood.test <- blood[-trainInd,] | |
# Set variables for training and creating model. below is the 'generalized boosted regression model' method. | |
objControl <- trainControl(method='cv',number=3,returnResamp='none',summaryFunction=twoClassSummary,classProbs=TRUE) | |
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="gbm",trControl=objControl,metric="ROC",preProc = c("center","scale")) | |
#Also tried knn and neural network methods. | |
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="knn",preProc = c("center","scale"),tuneLength=10,trControl=trainControl(method="cv")) | |
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="nnet",preProc = "range",tuneLength=10,trace=F,maxit=1000) | |
#tuneLength refers to the numer of nodes, trace = TRUE will run through the learning process, maxit is the max number of iteratons | |
# review results | |
summary(blood.fit) | |
print(blood.fit) | |
# test the trained model against validation data and review results | |
predictions <- predict(object=blood.fit,blood.test[,1:5],type="raw") | |
head(predictions) | |
print(postResample(pred=predictions,obs=as.factor(blood.test[,outcomeName]))) | |
predictions <- predict(object=blood.fit,blood.test[,1:5],type="prob") | |
head(predictions) | |
# review area under curve to see accuracy of prediction | |
auc <- roc(ifelse(blood.test[,outcomeName]=="yes",1,0),predictions[[2]]) | |
auc$auc # This shows accuracy of the model | |
# plot importance of variables in model | |
var <- varImp(blood.fit) | |
plot(var) | |
df <- read.csv(file.choose()) | |
dftest <- as.data.frame(cbind(df[,2], df[,3], df[,5], df[,6])) | |
names(dftest) <- c("Recency", "Frequency", "Time", "Monetary") | |
scoring <- function (df,column,r=5){ | |
#get the length of rows of df | |
len <- dim(df)[1] | |
score <- rep(0,times=len) | |
# get the quantity of rows per 1/r e.g. 1/5 | |
nr <- round(len / r) | |
if (nr > 0){ | |
# seperate the rows by r aliquots | |
rStart <-0 | |
rEnd <- 0 | |
for (i in 1:r){ | |
#set the start row number and end row number | |
rStart = rEnd+1 | |
#skip one "i" if the rStart is already in the i+1 or i+2 or ...scope. | |
if (rStart> i*nr) next | |
if (i == r){ | |
if(rStart<=len ) rEnd <- len else next | |
}else{ | |
rEnd <- i*nr | |
} | |
# set the Recency score | |
score[rStart:rEnd]<- r-i+1 | |
# make sure the customer who have the same recency have the same score | |
s <- rEnd+1 | |
if(i<r & s <= len){ | |
for(u in s: len){ | |
if(df[rEnd,column]==df[u,column]){ | |
score[u]<- r-i+1 | |
rEnd <- u | |
}else{ | |
break; | |
} | |
} | |
} | |
} | |
} | |
return(score) | |
} #end of function Scoring | |
getIndependentScore <- function(df,r=5,f=5,m=5) { | |
if (r<=0 || f<=0 || m<=0) return | |
#order and the score | |
df <- df[order(df$Recency,-df$Frequency,-df$Monetary),] | |
R_Score <- scoring(df,"Recency",r) | |
df <- cbind(df, R_Score) | |
df <- df[order(-df$Frequency,df$Recency,-df$Monetary),] | |
F_Score <- scoring(df,"Frequency",f) | |
df <- cbind(df, F_Score) | |
df <- df[order(-df$Monetary,df$Recency,-df$Frequency),] | |
M_Score <- scoring(df,"Monetary",m) | |
df <- cbind(df, M_Score) | |
#order the dataframe by R_Score, F_Score, and M_Score desc | |
df <- df[order(-df$R_Score,-df$F_Score,-df$M_Score),] | |
# caculate the total score | |
Total_Score <- c(100*df$R_Score + 10*df$F_Score+df$M_Score) | |
df <- cbind(df,Total_Score) | |
return (df) | |
} # end of function getIndependentScore | |
test_score <- getIndependentScore(df) | |
drawHistograms <- function(df,r=5,f=5,m=5){ | |
#set the layout plot window | |
par(mfrow = c(f,r)) | |
names <-rep("",times=m) | |
for(i in 1:m) names[i]<-paste("M",i) | |
for (i in 1:f){ | |
for (j in 1:r){ | |
c <- rep(0,times=m) | |
for(k in 1:m){ | |
tmpdf <-df[df$R_Score==j & df$F_Score==i & df$M_Score==k,] | |
c[k]<- dim(tmpdf)[1] | |
} | |
if (i==1 & j==1) | |
barplot(c,col="lightblue",names.arg=names) | |
else | |
barplot(c,col="lightblue") | |
if (j==1) title(ylab=paste("F",i)) | |
if (i==1) title(main=paste("R",j)) | |
} | |
} | |
par(mfrow = c(1,1)) | |
} # end of drawHistograms function | |
drawHistograms(test_score) | |
# from http://www.dataapple.net/?p=84 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment