Skip to content

Instantly share code, notes, and snippets.

@jkuruzovich
Last active December 30, 2015 02:08
Show Gist options
  • Save jkuruzovich/7760277 to your computer and use it in GitHub Desktop.
Save jkuruzovich/7760277 to your computer and use it in GitHub Desktop.
Additional See Click Predict Fix for class
#Start of Kaggle See Click Predict
# Script for the Kaggle "See Click Predict Fix" competition: predict
# num_votes / num_comments / num_views for civic (311-style) issue reports.
# NOTE(review): setwd() to a personal Dropbox path makes this non-portable.
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Quick structural look at the training data.
str(train)
head(train)
names(train)
#Of all the ways of predicting something, the first we will explore is just taking the mean
# Baseline: predict the overall training mean of each target for every row.
train.votes.mean<-mean(train$num_votes)
train.comments.mean<-mean(train$num_comments)
train.views.mean<-mean(train$num_views)
#There are lots of ways to assess performance.
#https://www.kaggle.com/wiki/Metrics/history/1012
# Root Mean Squared Logarithmic Error (RMSLE), the competition's metric:
#   sqrt(mean((log(actual + 1) - log(predicted + 1))^2))
# Both inputs are numeric vectors of equal length; returns a single number.
rmsle <- function(actual, predicted) {
  squared_log_error <- (log(actual + 1) - log(predicted + 1))^2
  sqrt(mean(squared_log_error))
}
# Write a Kaggle submission file.
#
# Args:
#   filename: path of the CSV file to write.
#   id: vector of test-set ids.
#   pred.votes, pred.comments, pred.views: raw model predictions.
#
# Predictions are rounded to whole numbers and clamped at zero (counts
# cannot be negative), then written with the column order and names the
# competition expects: id, num_views, num_votes, num_comments.
submitfile <- function(filename, id, pred.votes, pred.comments, pred.views)
{
  # pmax() is the vectorized clamp; replaces the round() + ifelse() pair.
  pred.votes    <- pmax(round(pred.votes), 0)
  pred.comments <- pmax(round(pred.comments), 0)
  pred.views    <- pmax(round(pred.views), 0)
  submit <- data.frame(id, pred.views, pred.votes, pred.comments)
  colnames(submit) <- c("id", "num_views", "num_votes", "num_comments")
  write.csv(submit, filename, row.names = FALSE)
}
# Score one set of predictions against actuals with RMSLE.
#
# Predictions get the same rounding/clamping as submitfile(), so the
# training-set score matches what a submission built from these
# predictions would contain.
#
# Returns an unnamed numeric vector of length 3:
#   [1] votes RMSLE, [2] comments RMSLE, [3] views RMSLE.
perf <- function(act.votes, pred.votes, act.comments, pred.comments,
                 act.views, pred.views)
{
  pred.votes    <- pmax(round(pred.votes), 0)
  pred.comments <- pmax(round(pred.comments), 0)
  pred.views    <- pmax(round(pred.views), 0)
  # Return the vector directly: the original ended on an assignment, which
  # returns its value invisibly -- it worked only because every caller
  # captured the result with <-.
  c(rmsle(act.votes, pred.votes),
    rmsle(act.comments, pred.comments),
    rmsle(act.views, pred.views))
}
#Let's create a data frame that keeps track of how we do with performance.
# BUG FIX: the last column was named "avg.rmsle" here, but every pf row built
# below is renamed to "mean.rmsle" before rbind(); rbind.data.frame() errors
# when column names disagree, so the tracker must use "mean.rmsle" too.
results <- data.frame(matrix(vector(), 0, 6,
                             dimnames = list(c(), c("data", "method",
                                                    "votes.rmsle", "comments.rmsle",
                                                    "views.rmsle", "mean.rmsle"))),
                      stringsAsFactors = FALSE)
#let's assign some intermediate variables so we can reuse code more easily.
#Start reuse code
# Baseline model: every row is predicted as the training mean of its target.
pred.votes<-train.votes.mean
pred.comments<-train.comments.mean
pred.views<-train.views.mean
act.votes<-train$num_votes
act.comments<-train$num_comments
act.views<-train$num_views
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
pf<-data.frame("train", "mean", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
write.csv(results, "results.csv",row.names=FALSE)
#end reuse code
#Now let's use this to generate test data for our data based on this model using a function.
# NOTE(review): file name has no .csv extension, though submitfile() writes CSV.
submitfile ("functionoutfile3", test$id, pred.votes, pred.comments, pred.views )
#Submission of this landed me at 425. KGI = 1.1915. Compared to .29 for the leaders.
#1.19153
#There is a discussion here of how to use the model
# https://www.kaggle.com/c/see-click-predict-fix/forums/t/6378/rmsle-vs-target
#Now let's apply a simple regression model.
#From last class we did an analysis that showed 4 cities. We are going to just use longitude/latitude to calculate.
# Approximate city centers (lat, lon):
#1 41.85662 -87.68507 1 Chicago, IL
#2 2 37.54046 -77.46269 2 Richmond, VA
#3 3 41.31132 -72.92412 3 New Haven, CT
#4 4 37.80239 -122.24116 4 Oakland, CA
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
#Initially pull in only 5000 rows to work with data.
#train <- read.csv("train.csv", nrows=5000)
str(train)
head(train)
names(train)
#One hypothesis could be that in general city is likely to be meaningful. To judge how meaningful our
#predictions are we are going to have to compare them to something. Let's compare to just guessing the
#overall mean each time.
# We are also going to do all data fixes to our test file so we are ready to submit.
# Bucket each record into a city using longitude alone -- the four cities are
# far enough apart east-west that these break points separate them cleanly.
train$city <- cut(train$longitude,
breaks=c(-Inf, -104, -82.5, -75.2, Inf),
labels=c('Oakland','Chicago', 'Richmond', 'New Haven'))
# Same derivation on the test set so it is ready for submission.
test$city <- cut(test$longitude,
breaks=c(-Inf, -104, -82.5, -75.2, Inf),
labels=c('Oakland','Chicago', 'Richmond', 'New Haven'))
#Look to see potential variables
str(train)
# Simple text-length features: character counts of summary and description.
train$summarync<-nchar(as.character(train$summary))
train$descriptionnc<-nchar(as.character(train$description))
test$summarync<-nchar(as.character(test$summary))
test$descriptionnc<-nchar(as.character(test$description))
#We can see that created_time is incorrectly being identified as a factor, going to change first to string
time_char<-as.character((train$created_time))
#Then to POSIX (time)
train$time<-as.POSIXct(strptime(time_char, format = "%Y-%m-%d %H:%M:%S"))
# Add an explicit "unknown" level, then recode NAs to it, so models can treat
# missingness as its own category instead of dropping rows.
train$source = factor(train$source, levels=c(levels(train$source), "unknown"))
train$source[is.na(train$source )] = "unknown"
train$tag = factor(train$tag, levels=c(levels(train$tag), "unknown"))
train$tag[is.na(train$tag )] = "unknown"
#optional recoding
# Collapse rare/ambiguous tags into broader buckets to cut factor levels.
# NOTE(review): assumes the target levels ("abandoned_vehicle", "unknown",
# "tree", "traffic") already exist among the factor's levels -- if one does
# not, the assignment would silently produce NAs.
train$tag[train$tag=="abandoned_vehicles"]<-"abandoned_vehicle"
train$tag[train$tag=="lost_and_found"]<-"unknown"
train$tag[train$tag=="other"]<-"unknown"
train$tag[train$tag=="overgrowth"]<-"tree"
train$tag[train$tag=="pedestrian_light"]<-"traffic"
train$tag[train$tag=="public_art"]<-"unknown"
train$tag[train$tag=="public_concern"]<-"unknown"
train$tag[train$tag=="roadkill"]<-"traffic"
train$tag[train$tag=="street_light"]<-"traffic"
train$tag[train$tag=="street_signal"]<-"traffic"
train$tag[train$tag=="zoning"]<-"unknown"
# Re-factor to drop the now-unused levels.
train$tag<-factor(as.character(train$tag))
#test data
time_chart<-as.character((test$created_time))
#Then to POSIX (time)
test$time<-as.POSIXct(strptime(time_chart, format = "%Y-%m-%d %H:%M:%S"))
test$source = factor(test$source, levels=c(levels(test$source), "unknown"))
test$source[is.na(test$source )] = "unknown"
test$tag = factor(test$tag, levels=c(levels(test$tag), "unknown"))
test$tag[is.na(test$tag )] = "unknown"
test$tag[test$tag=="abandoned_vehicles"]<-"abandoned_vehicle"
test$tag[test$tag=="lost_and_found"]<-"unknown"
test$tag[test$tag=="other"]<-"unknown"
test$tag[test$tag=="overgrowth"]<-"tree"
test$tag[test$tag=="pedestrian_light"]<-"traffic"
test$tag[test$tag=="public_art"]<-"unknown"
test$tag[test$tag=="public_concern"]<-"unknown"
test$tag[test$tag=="roadkill"]<-"traffic"
test$tag[test$tag=="street_light"]<-"traffic"
test$tag[test$tag=="street_signal"]<-"traffic"
test$tag[test$tag=="zoning"]<-"unknown"
test$tag<-factor(as.character(test$tag))
#Now select a minimal set of modelling columns: the three targets plus predictors.
train_min <- train[, c('num_votes','num_comments','num_views','latitude', 'longitude', 'time', 'tag', 'source','city','summarync', 'descriptionnc')]
#verify complete data (prints any rows with missing values)
train_min[!complete.cases(train_min),]
#to see how we do from here out, we can split our training set in 2
set.seed(1234)
ind <- sample(2, nrow(train_min), replace=TRUE, prob=c(0.5,0.5))
#Split the data: train_a to fit on, train_b as a held-out half.
train_a <- train_min[ind==1,]
train_b <- train_min[ind==2,]
# Renamed from `c`: a variable named c masks base::c() and is a foot-gun.
n_cols <- ncol(train_a)
# One frame per target: the target column plus the shared predictors (cols 4..n).
train_a_votes <- train_a[, c(1, 4:n_cols)]
train_a_comm  <- train_a[, c(2, 4:n_cols)]
train_a_view  <- train_a[, c(3, 4:n_cols)]
#Let's just toss everything into a linear regression, one model per target.
lm.fit.votes <- lm(num_votes~., data=train_a_votes)
summary(lm.fit.votes)
lm.fit.comm <- lm(num_comments~., data=train_a_comm)
summary(lm.fit.comm)
lm.fit.view <- lm(num_views~., data=train_a_view)
summary(lm.fit.view)
#Now let's make our predictions for our train_a (in-sample fitted values)
train_a$pvotes <- predict(lm.fit.votes)
train_a$pcomments <- predict(lm.fit.comm)
train_a$pviews <- predict(lm.fit.view)
#Start reuse code
#set predicted values
pred.votes<-train_a$pvotes
pred.comments<-train_a$pcomments
pred.views<-train_a$pviews
#set actual values
act.votes<-train_a$num_votes
act.comments<-train_a$num_comments
act.views<-train_a$num_views
#let's assign some intermediate variables so we can reuse code more easily.
#Start reuse code
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
pf<-data.frame("train_a", "regression", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
#end reuse code
#check out how we did compared with the other code. A nice improvement!
View(results)
#This is an example of cross validation: score the held-out half (train_b).
pred.votes<- predict(lm.fit.votes, newdata=train_b)
pred.comments <- predict(lm.fit.comm, newdata=train_b)
pred.views <- predict(lm.fit.view, newdata=train_b)
#set actual values
act.votes<-train_b$num_votes
act.comments<-train_b$num_comments
act.views<-train_b$num_views
#calculate performance
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
pf<-data.frame("train_b", "regression", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
#end reuse code
#OK, Let's try to predict our test data. Get prepared for an error.
pred.votes<- predict(lm.fit.votes, newdata=test)
#Basically this error means that a factor variable in the test set has a level we don't know about. Let's just
#recode it to unknown. ("unknown" was added to test$tag's levels earlier.)
test$tag[(test$tag=="bus_lane" )] = "unknown"
#OK, Let's try again.
pred.votes<- predict(lm.fit.votes, newdata=test)
pred.comments <- predict(lm.fit.comm, newdata=test)
pred.views <- predict(lm.fit.view, newdata=test)
#Now let's use this to generate our submission file.
submitfile ("regression3.csv", test$id, pred.votes, pred.comments, pred.views )
#install.packages('nnet')
# library() errors immediately if nnet is missing; require() only warns.
library(nnet)
#Now let's see how neural networks do.
# NOTE(review): nnet() defaults to linout = FALSE (logistic output units),
# which caps predictions in (0, 1) -- a likely reason this model scores
# poorly on unbounded count targets; consider linout = TRUE.
nnet.fit.votes <- nnet(num_votes~., data=train_a_votes, size = 6 )
nnet.fit.comm <- nnet(num_comments~., data=train_a_comm, size = 6 )
nnet.fit.views<- nnet(num_views~., data=train_a_view, size = 6)
pred.votes<-predict(nnet.fit.votes)
# BUG FIX: this read `nnet.fit.comments`, an object that was never created;
# the comments model is stored in nnet.fit.comm above.
pred.comments<-predict(nnet.fit.comm)
pred.views<-predict(nnet.fit.views)
#set actual values
act.votes<-train_a$num_votes
act.comments<-train_a$num_comments
act.views<-train_a$num_views
#let's assign some intermediate variables so we can reuse code more easily.
#Start reuse code
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
pf<-data.frame("train_a", "NeuralNetwork-Size-6", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
#Well that didn't go so well. Change the tuning parameter size and retry.
#Well That didn't go so Well. Change the tuning paramer size and retry
#install.packages('randomForest')
library(randomForest)
#train_a_viewb<-train_a_view[,c(1:4,6,7)]
#train_a_commb<-train_a_comm[,c(1:4,6,7)]
#train_a_votesb<-train_a_votes[,c(1:4,6,7)]
rf.fit.views<- randomForest(num_views~., data=train_a_view, maxnodes=30)
rf.fit.comm<- randomForest(num_comments~., data=train_a_comm, maxnodes=30)
rf.fit.votes<- randomForest(num_votes~., data=train_a_votes, maxnodes=30)
pred.votes<-predict(rf.fit.votes)
pred.comments<-predict(rf.fit.comm)
pred.views<-predict(rf.fit.views)
#set actual values
act.votes<-train_a$num_votes
act.comments<-train_a$num_comments
act.views<-train_a$num_views
#let's assign some intermediate variables so we can reuse code more easily.
#Start reuse code
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
pf<-data.frame("train_a", "Random Forest-30", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
pred.votes<-predict(rf.fit.votes)
pred.comments<-predict(rf.fit.comm)
pred.views<-predict(rf.fit.views)
#set actual values
act.votes<-train_a$num_votes
act.comments<-train_a$num_comments
act.views<-train_a$num_views
#let's assign some intermediate variables so we can reuse code more easily.
#Start reuse code
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
pf<-data.frame("train_a", "random forest maxnodes 10", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
pred.votes<- predict(rf.fit.votes, newdata=train_b)
pred.comments <- predict(rf.fit.comm, newdata=train_b)
pred.views <- predict(rf.fit.views, newdata=train_b)
#set actual values
act.votes<-train_b$num_votes
act.comments<-train_b$num_comments
act.views<-train_b$num_views
#let's assign some intermediate variables so we can reuse code more easily.
#Start reuse code
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
pf<-data.frame("train_b", "random forest 30", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
# Per-city random forests for each target (maxnodes = 20).
# NOTE(review): the comments/votes frames are subset by train_a_view$city;
# this only works because all three frames are row-aligned slices of
# train_a -- fragile if any frame is ever reordered or refiltered.
rf.fit.views.o<- randomForest(num_views~., data=train_a_view[train_a_view$city=="Oakland",], maxnodes=20)
rf.fit.views.c<- randomForest(num_views~., data=train_a_view[train_a_view$city=="Chicago",], maxnodes=20)
rf.fit.views.r<- randomForest(num_views~., data=train_a_view[train_a_view$city=="Richmond",], maxnodes=20)
rf.fit.views.n<- randomForest(num_views~., data=train_a_view[train_a_view$city=="New Haven",], maxnodes=20)
rf.fit.comm.o<- randomForest(num_comments~., data=train_a_comm[train_a_view$city=="Oakland",], maxnodes=20)
rf.fit.comm.c<- randomForest(num_comments~., data=train_a_comm[train_a_view$city=="Chicago",], maxnodes=20)
rf.fit.comm.r<- randomForest(num_comments~., data=train_a_comm[train_a_view$city=="Richmond",], maxnodes=20)
rf.fit.comm.n<- randomForest(num_comments~., data=train_a_comm[train_a_view$city=="New Haven",], maxnodes=20)
rf.fit.votes.o<- randomForest(num_votes~., data=train_a_votes[train_a_view$city=="Oakland",], maxnodes=20)
rf.fit.votes.c<- randomForest(num_votes~., data=train_a_votes[train_a_view$city=="Chicago",], maxnodes=20)
rf.fit.votes.r<- randomForest(num_votes~., data=train_a_votes[train_a_view$city=="Richmond",], maxnodes=20)
rf.fit.votes.n<- randomForest(num_votes~., data=train_a_votes[train_a_view$city=="New Haven",], maxnodes=20)
# Per-city vote predictions. newdata is a slice of train_a (all columns);
# predict() uses only the formula's predictors, so extras are ignored.
pred.votes.o<-predict(rf.fit.votes.o, newdata=train_a[train_a_view$city=="Oakland",])
pred.votes.c<-predict(rf.fit.votes.c, newdata=train_a[train_a_view$city=="Chicago",])
pred.votes.r<-predict(rf.fit.votes.r, newdata=train_a[train_a_view$city=="Richmond",])
pred.votes.n<-predict(rf.fit.votes.n, newdata=train_a[train_a_view$city=="New Haven",])
#set actual values
act.votes.o<-train_a$num_votes[train_a$city=="Oakland"]
act.votes.c<-train_a$num_votes[train_a$city=="Chicago"]
act.votes.r<-train_a$num_votes[train_a$city=="Richmond"]
act.votes.n<-train_a$num_votes[train_a$city=="New Haven"]
# Score only the New Haven slice.
pred.votes<-pred.votes.n
act.votes<-act.votes.n
#let's assign some intermediate variables so we can reuse code more easily.
#Start reuse code
# NOTE(review): act.comments / pred.comments / act.views / pred.views still
# hold stale values from the previous section, so only the votes.rmsle
# column of this results row is meaningful.
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
pf<-data.frame("train_a", "random forest 20-votes-New Haven", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
#calculate final prediction
#take the final 40% of the data
# NOTE(review): 223129 * 0.6 = 133877.4, so "the final 40%" would start near
# row 133878 -- the slice below actually starts at row 100000 (~last 55%).
223129-(223129*(.4))
train_a<-train_min[100000:223129,]
# Renamed from `c`: a variable named c masks base::c().
n_final_cols <- ncol(train_a)
train_a_votes <- train_a[, c(1, 4:n_final_cols)]
train_a_comm  <- train_a[, c(2, 4:n_final_cols)]
train_a_view  <- train_a[, c(3, 4:n_final_cols)]
# Refit the forests on the larger slice with maxnodes = 20.
rf.fit.views<- randomForest(num_views~., data=train_a_view, maxnodes=20)
rf.fit.comm<- randomForest(num_comments~., data=train_a_comm, maxnodes=20)
rf.fit.votes<- randomForest(num_votes~., data=train_a_votes, maxnodes=20)
# Out-of-bag predictions on the fitting slice.
pred.votes<-predict(rf.fit.votes)
pred.comments<-predict(rf.fit.comm)
pred.views<-predict(rf.fit.views)
#set actual values
act.votes<-train_a$num_votes
act.comments<-train_a$num_comments
act.views<-train_a$num_views
# BUG FIX: perf() was called with `actr.votes`, an object that does not
# exist; corrected to act.votes.
p<-perf(act.votes, pred.votes,act.comments, pred.comments, act.views, pred.views)
# Label fixed: these forests use maxnodes = 20, not 30.
pf<-data.frame("train_a", "Random Forest-20-final", p[1],p[2],p[3],mean(p))
colnames(pf)<-c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results<-rbind(results,pf)
#OK, Let's try again: score the final forests on the test set.
# BUG FIX: this section ran two prediction passes; the second, redundant
# copy reused the stale linear model `lm.fit.view` for views, overwriting
# the random-forest views predictions.  One pass with the forests suffices.
pred.votes<-predict(rf.fit.votes, newdata=test)
pred.comments<-predict(rf.fit.comm, newdata=test)
pred.views<-predict(rf.fit.views,newdata=test)
#Now let's use this to generate our submission file.
# (file name typo "randomfores.csv" corrected)
submitfile ("randomforest.csv", test$id, pred.votes, pred.comments, pred.views )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment