Skip to content

Instantly share code, notes, and snippets.

@conorbmurphy
Last active February 11, 2016 19:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save conorbmurphy/9f9045a233d4249870d5 to your computer and use it in GitHub Desktop.
Save conorbmurphy/9f9045a233d4249870d5 to your computer and use it in GitHub Desktop.
RFM Draft for Blood Donations
# load required packages
library(caret)
library(pROC)
library(e1071)
blood <- read.csv(file.choose())
names(blood) <- c("Donor","Recency","Frequency","Monetary","Times","Donated")
t
# Creates a data frame for the quantiles of rows 2:5 in blood
bloodQuant <- data.frame()
for (i in 1:5) {
for(j in 2:5) {
bloodQuant[j-1,i] <- quantile(blood[,j])[i]
}
}
names(bloodQuant) <- c(0, 25, 50, 75, 100)
# Next steps:
# For recency: give score 5 for between 0-25, 4 for between 25-50, etc
# For other variables: give score 1 for bewteen 0-25, 2 for between 25-50, etc
#rfm scoring code. need to update.
ifelse(col>=16,r_score<-1,ifelse(col>=11,r_score<-2,ifelse(col>=4,r_score<-3,ifelse(col>=2,r_score<-4,r_score<-5))))}
p <- 1/score
q <- quantile(df$col,probs=seq(0,1,by=p))
a <- quantile(blood[,2], probs = seq(0,1,by=P))
b <- quantile(blood[,3], probs = seq(0,1,by=P))
c <- quantile(blood[,4], probs = seq(0,1,by=P))
d <- quantile(blood[,5], probs = seq(0,1,by=P))
quants <- rbind(a, b, c, d)
#bar plot of donations by RFM Score to visualize the counts of 1 and 0 in each RFM group
counts <- table(blood$donated, blood$group)
barplot(counts,main="Donated in March by RFM Score",xlab="RFM Score",col=c("green","red"),legend=rownames(counts),beside=T)
# create column showing whether they gave blood in March as yes and no
# flag it is as a factor
blood$donated2 <- ifelse(blood$donated==1,'yes','no')
blood$donated2 <- as.factor(blood$donated2)
outcomeName <- 'donated2'
# split test data into portion for training model and for validating the model
set.seed(700)
trainInd <- createDataPartition(blood[,outcomeName],p=.7,list=F,times=1) # .7 refers to 70% partition,
blood.train <- blood[trainInd,]
blood.test <- blood[-trainInd,]
# Set variables for training and creating model. below is the 'generalized boosted regression model' method.
objControl <- trainControl(method='cv',number=3,returnResamp='none',summaryFunction=twoClassSummary,classProbs=TRUE)
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="gbm",trControl=objControl,metric="ROC",preProc = c("center","scale"))
#Also tried knn and neural network methods.
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="knn",preProc = c("center","scale"),tuneLength=10,trControl=trainControl(method="cv"))
blood.fit <- train(blood.train[,1:5],blood.train[,outcomeName],method="nnet",preProc = "range",tuneLength=10,trace=F,maxit=1000)
#tuneLength refers to the numer of nodes, trace = TRUE will run through the learning process, maxit is the max number of iteratons
# review results
summary(blood.fit)
print(blood.fit)
# test the trained model against validation data and review results
predictions <- predict(object=blood.fit,blood.test[,1:5],type="raw")
head(predictions)
print(postResample(pred=predictions,obs=as.factor(blood.test[,outcomeName])))
predictions <- predict(object=blood.fit,blood.test[,1:5],type="prob")
head(predictions)
# review area under curve to see accuracy of prediction
auc <- roc(ifelse(blood.test[,outcomeName]=="yes",1,0),predictions[[2]])
auc$auc # This shows accuracy of the model
# plot importance of variables in model
var <- varImp(blood.fit)
plot(var)
df <- read.csv(file.choose())
dftest <- as.data.frame(cbind(df[,2], df[,3], df[,5], df[,6]))
names(dftest) <- c("Recency", "Frequency", "Time", "Monetary")
scoring <- function (df,column,r=5){
#get the length of rows of df
len <- dim(df)[1]
score <- rep(0,times=len)
# get the quantity of rows per 1/r e.g. 1/5
nr <- round(len / r)
if (nr > 0){
# seperate the rows by r aliquots
rStart <-0
rEnd <- 0
for (i in 1:r){
#set the start row number and end row number
rStart = rEnd+1
#skip one "i" if the rStart is already in the i+1 or i+2 or ...scope.
if (rStart> i*nr) next
if (i == r){
if(rStart<=len ) rEnd <- len else next
}else{
rEnd <- i*nr
}
# set the Recency score
score[rStart:rEnd]<- r-i+1
# make sure the customer who have the same recency have the same score
s <- rEnd+1
if(i<r & s <= len){
for(u in s: len){
if(df[rEnd,column]==df[u,column]){
score[u]<- r-i+1
rEnd <- u
}else{
break;
}
}
}
}
}
return(score)
} #end of function Scoring
getIndependentScore <- function(df,r=5,f=5,m=5) {
if (r<=0 || f<=0 || m<=0) return
#order and the score
df <- df[order(df$Recency,-df$Frequency,-df$Monetary),]
R_Score <- scoring(df,"Recency",r)
df <- cbind(df, R_Score)
df <- df[order(-df$Frequency,df$Recency,-df$Monetary),]
F_Score <- scoring(df,"Frequency",f)
df <- cbind(df, F_Score)
df <- df[order(-df$Monetary,df$Recency,-df$Frequency),]
M_Score <- scoring(df,"Monetary",m)
df <- cbind(df, M_Score)
#order the dataframe by R_Score, F_Score, and M_Score desc
df <- df[order(-df$R_Score,-df$F_Score,-df$M_Score),]
# caculate the total score
Total_Score <- c(100*df$R_Score + 10*df$F_Score+df$M_Score)
df <- cbind(df,Total_Score)
return (df)
} # end of function getIndependentScore
test_score <- getIndependentScore(df)
drawHistograms <- function(df,r=5,f=5,m=5){
#set the layout plot window
par(mfrow = c(f,r))
names <-rep("",times=m)
for(i in 1:m) names[i]<-paste("M",i)
for (i in 1:f){
for (j in 1:r){
c <- rep(0,times=m)
for(k in 1:m){
tmpdf <-df[df$R_Score==j & df$F_Score==i & df$M_Score==k,]
c[k]<- dim(tmpdf)[1]
}
if (i==1 & j==1)
barplot(c,col="lightblue",names.arg=names)
else
barplot(c,col="lightblue")
if (j==1) title(ylab=paste("F",i))
if (i==1) title(main=paste("R",j))
}
}
par(mfrow = c(1,1))
} # end of drawHistograms function
drawHistograms(test_score)
# from http://www.dataapple.net/?p=84
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment