Last active
December 30, 2015 02:08
-
-
Save jkuruzovich/7760277 to your computer and use it in GitHub Desktop.
Additional See Click Predict Fix for class
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Start of Kaggle See Click Predict Fix
# NOTE(review): hard-coded, machine-specific working directory; train.csv and
# test.csv are read relative to it.
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")

# stringsAsFactors = TRUE is spelled out because later steps call levels() on
# the text columns (source, tag). Since R 4.0.0 read.csv() returns character
# columns by default, for which levels() is NULL.
train <- read.csv("train.csv", stringsAsFactors = TRUE)
test <- read.csv("test.csv", stringsAsFactors = TRUE)

# Quick look at the training data.
str(train)
head(train)
names(train)

# Of all the ways of predicting something, the first we will explore is just
# taking the mean of each target over the whole training set.
train.votes.mean <- mean(train$num_votes)
train.comments.mean <- mean(train$num_comments)
train.views.mean <- mean(train$num_views)
#There are lots of ways to assess performance. | |
#https://www.kaggle.com/wiki/Metrics/history/1012 | |
#We will create a helper function that gives us the RMSLE (root mean squared logarithmic error), the metric this analysis is scored on.
# Root Mean Squared Logarithmic Error (RMSLE).
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predictions; a single value is recycled
#            against `actual` by R's usual vector arithmetic.
#
# Returns sqrt(mean((log(1 + actual) - log(1 + predicted))^2)).
rmsle <- function(actual, predicted) {
  # log1p(x) is log(1 + x) computed without the precision loss that
  # log(x + 1) suffers when x is very close to zero.
  sqrt(mean((log1p(actual) - log1p(predicted))^2))
}
# Write a submission CSV.
#
# filename:      path of the CSV file to write.
# id:            ids, one per row of the submission.
# pred.votes,
# pred.comments,
# pred.views:    numeric predictions for the three targets.
#
# Each prediction is rounded to a whole number and anything below zero is
# raised to zero. Columns are written in the order id, num_views, num_votes,
# num_comments, without row names.
submitfile <- function(filename, id, pred.votes, pred.comments, pred.views) {
  # Round first, then clamp at zero.
  as_count <- function(x) {
    pmax(round(x, digits = 0), 0)
  }

  submit <- data.frame(
    id,
    as_count(pred.views),
    as_count(pred.votes),
    as_count(pred.comments)
  )
  colnames(submit) <- c("id", "num_views", "num_votes", "num_comments")

  write.csv(submit, filename, row.names = FALSE)
}
# Score one set of predictions with rmsle().
#
# act.*:  observed votes / comments / views.
# pred.*: predicted votes / comments / views.
#
# Predictions get the same treatment as in a submission (rounded to whole
# numbers, negatives raised to zero) before scoring. Returns, invisibly, a
# length-3 numeric vector: RMSLE for votes, comments and views, in that order.
perf <- function(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views) {
  # Round first, then clamp at zero.
  as_count <- function(x) {
    pmax(round(x, digits = 0), 0)
  }

  votes_score <- rmsle(act.votes, as_count(pred.votes))
  comments_score <- rmsle(act.comments, as_count(pred.comments))
  views_score <- rmsle(act.views, as_count(pred.views))

  invisible(c(votes_score, comments_score, views_score))
}
# Scoreboard that keeps track of how each method performs. The column names
# match the rows appended to it below (the last column is mean.rmsle).
results <- data.frame(
  data = character(0),
  method = character(0),
  votes.rmsle = numeric(0),
  comments.rmsle = numeric(0),
  views.rmsle = numeric(0),
  mean.rmsle = numeric(0),
  stringsAsFactors = FALSE
)

# Baseline: predict the training mean of each target for every row.
# Intermediate variables are used so the scoring code can be reused as is.
pred.votes <- train.votes.mean
pred.comments <- train.comments.mean
pred.views <- train.views.mean
act.votes <- train$num_votes
act.comments <- train$num_comments
act.views <- train$num_views

p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame(
  data = "train",
  method = "mean",
  votes.rmsle = p[1],
  comments.rmsle = p[2],
  views.rmsle = p[3],
  mean.rmsle = mean(p),
  stringsAsFactors = FALSE
)
results <- rbind(results, pf)
write.csv(results, "results.csv", row.names = FALSE)

# Now use the same constant predictions to generate a submission for the
# test data. The single mean values are recycled across test$id.
submitfile("functionoutfile3", test$id, pred.votes, pred.comments, pred.views)
# Submission of this landed me at 425. KGI = 1.1915. Compared to .29 for the leaders.
# 1.19153
# There is a discussion here of how to use the model
# https://www.kaggle.com/c/see-click-predict-fix/forums/t/6378/rmsle-vs-target
# Next step: a simple regression model.
# An earlier analysis showed 4 cities; here only longitude is used to tell
# them apart.
#   1 41.85662  -87.68507 Chicago, IL
#   2 37.54046  -77.46269 Richmond, VA
#   3 41.31132  -72.92412 New Haven, CT
#   4 37.80239 -122.24116 Oakland, CA
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
# Initially pull in only 5000 rows to work with data.
# train <- read.csv("train.csv", nrows=5000)
str(train)
head(train)
names(train)

# Hypothesis: city is likely to be meaningful. To judge the predictions they
# are compared against always guessing the overall mean.
# Every data fix is applied to the test file too, so it is ready to submit.

# Bin a longitude vector into the four cities, west to east.
city_from_longitude <- function(longitude) {
  cut(
    longitude,
    breaks = c(-Inf, -104, -82.5, -75.2, Inf),
    labels = c("Oakland", "Chicago", "Richmond", "New Haven")
  )
}
train$city <- city_from_longitude(train$longitude)
test$city <- city_from_longitude(test$longitude)

# Look to see potential variables.
str(train)

# Character counts of the free-text fields.
train$summarync <- nchar(as.character(train[["summary"]]))
train$descriptionnc <- nchar(as.character(train[["description"]]))
test$summarync <- nchar(as.character(test[["summary"]]))
test$descriptionnc <- nchar(as.character(test[["description"]]))
# created_time arrives as text (a factor or a character vector depending on
# how the CSV was read): go through character, then to POSIXct.
parse_created_time <- function(x) {
  as.POSIXct(strptime(as.character(x), format = "%Y-%m-%d %H:%M:%S"))
}

# Replace missing values with "unknown" and return a factor whose levels are
# the values already present plus "unknown". Works for factor and character
# input alike, and does not fail when "unknown" is already a level.
add_unknown_level <- function(x) {
  known <- if (is.factor(x)) levels(x) else sort(unique(as.character(x)))
  values <- as.character(x)
  values[is.na(values)] <- "unknown"
  factor(values, levels = union(known, "unknown"))
}

# Optional recoding: collapse some tags into broader ones (old name -> new).
tag_recode <- c(
  abandoned_vehicles = "abandoned_vehicle",
  lost_and_found = "unknown",
  other = "unknown",
  overgrowth = "tree",
  pedestrian_light = "traffic",
  public_art = "unknown",
  public_concern = "unknown",
  roadkill = "traffic",
  street_light = "traffic",
  street_signal = "traffic",
  zoning = "unknown"
)

# Missing tags become "unknown", tags listed in tag_recode are renamed, and
# the result is a factor with only the levels that actually occur. The
# recoding is done on character values: assigning a string that is not an
# existing level to a factor would produce NA with a warning.
clean_tag <- function(tag) {
  values <- as.character(tag)
  values[is.na(values)] <- "unknown"
  needs_recode <- values %in% names(tag_recode)
  values[needs_recode] <- unname(tag_recode[values[needs_recode]])
  factor(values)
}

# Training data.
train$time <- parse_created_time(train$created_time)
train$source <- add_unknown_level(train$source)
train$tag <- clean_tag(train$tag)

# Test data gets exactly the same treatment.
test$time <- parse_created_time(test$created_time)
test$source <- add_unknown_level(test$source)
test$tag <- clean_tag(test$tag)
# Keep only the three targets and the predictors used for modelling.
train_min <- train[, c(
  "num_votes", "num_comments", "num_views",
  "latitude", "longitude", "time", "tag", "source", "city",
  "summarync", "descriptionnc"
)]

# Verify complete data: prints any rows that contain a missing value.
train_min[!complete.cases(train_min), ]

# To see how we do from here on, split the training set randomly in 2
# (roughly half and half, reproducible through the seed).
set.seed(1234)
ind <- sample(2, nrow(train_min), replace = TRUE, prob = c(0.5, 0.5))
train_a <- train_min[ind == 1, ]
train_b <- train_min[ind == 2, ]

# One modelling frame per target: the target itself followed by all predictor
# columns. Columns are picked by name rather than position, which also avoids
# storing a column count in a variable called `c` (masking base::c).
target_cols <- c("num_votes", "num_comments", "num_views")
predictor_cols <- setdiff(names(train_a), target_cols)
train_a_votes <- train_a[, c("num_votes", predictor_cols)]
train_a_comm <- train_a[, c("num_comments", predictor_cols)]
train_a_view <- train_a[, c("num_views", predictor_cols)]
# First model: throw every predictor into a linear regression, one model per
# target. Each summary is printed right after its fit.
lm.fit.votes <- lm(num_votes ~ ., data = train_a_votes)
summary(lm.fit.votes)
lm.fit.comm <- lm(num_comments ~ ., data = train_a_comm)
summary(lm.fit.comm)
lm.fit.view <- lm(num_views ~ ., data = train_a_view)
summary(lm.fit.view)

# Predictions on the data the models were fitted to, stored on train_a.
train_a$pvotes <- predict(lm.fit.votes)
train_a$pcomments <- predict(lm.fit.comm)
train_a$pviews <- predict(lm.fit.view)

# Predicted values for the scoring code.
pred.votes <- train_a[["pvotes"]]
pred.comments <- train_a[["pcomments"]]
pred.views <- train_a[["pviews"]]

# Actual values for the scoring code.
act.votes <- train_a[["num_votes"]]
act.comments <- train_a[["num_comments"]]
act.views <- train_a[["num_views"]]

# Score and append a row to the scoreboard.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- setNames(
  data.frame("train_a", "regression", p[1], p[2], p[3], mean(p)),
  c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
)
results <- rbind(results, pf)

# Compare with the earlier rows of the scoreboard.
View(results)
# Holdout validation: score the models fitted on train_a against train_b,
# the half of the split they were not fitted on.
pred.votes <- predict(lm.fit.votes, newdata = train_b)
pred.comments <- predict(lm.fit.comm, newdata = train_b)
pred.views <- predict(lm.fit.view, newdata = train_b)

# Actual values.
act.votes <- train_b[["num_votes"]]
act.comments <- train_b[["num_comments"]]
act.views <- train_b[["num_views"]]

# Score and append a row to the scoreboard.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- setNames(
  data.frame("train_b", "regression", p[1], p[2], p[3], mean(p)),
  c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
)
results <- rbind(results, pf)
# First attempt at predicting the test data. This is expected to fail when
# the test set contains a tag level the model was not fitted with; try()
# prints the error but lets a sourced script carry on.
first_attempt <- try(predict(lm.fit.votes, newdata = test))

# The error means a factor variable in the test set has a level the model
# does not know about. Recode every such tag to "unknown" (the original fix
# handled only "bus_lane"), then give test$tag the same level set as
# train$tag so later models see matching factors.
# NOTE(review): assumes "unknown" is one of the levels of train$tag - confirm.
known_tags <- lm.fit.votes$xlevels[["tag"]]
test_tag <- as.character(test$tag)
test_tag[!(test_tag %in% known_tags)] <- "unknown"
test$tag <- factor(test_tag, levels = levels(train$tag))

# OK, let's try again.
pred.votes <- predict(lm.fit.votes, newdata = test)
pred.comments <- predict(lm.fit.comm, newdata = test)
pred.views <- predict(lm.fit.view, newdata = test)

# Now let's use this to generate our submission file.
submitfile("regression3.csv", test$id, pred.votes, pred.comments, pred.views)
# install.packages('nnet')
# library() stops with an error if the package is missing; require() would
# only return FALSE and let the script fail later.
library(nnet)

# Now let's see how neural networks do: one hidden layer with 6 units per
# target. linout = TRUE requests linear output units; nnet's default logistic
# output units cannot produce values outside (0, 1), which is wrong for these
# numeric targets.
nnet.fit.votes <- nnet(num_votes ~ ., data = train_a_votes, size = 6, linout = TRUE)
nnet.fit.comm <- nnet(num_comments ~ ., data = train_a_comm, size = 6, linout = TRUE)
nnet.fit.views <- nnet(num_views ~ ., data = train_a_view, size = 6, linout = TRUE)

# Predictions on the data the networks were fitted to. The comments model is
# called nnet.fit.comm (the original referred to an undefined
# nnet.fit.comments here).
pred.votes <- predict(nnet.fit.votes)
pred.comments <- predict(nnet.fit.comm)
pred.views <- predict(nnet.fit.views)

# Actual values.
act.votes <- train_a$num_votes
act.comments <- train_a$num_comments
act.views <- train_a$num_views

# Score and append a row to the scoreboard.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame("train_a", "NeuralNetwork-Size-6", p[1], p[2], p[3], mean(p))
colnames(pf) <- c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results <- rbind(results, pf)
# If this does not go well, change the tuning parameter `size` and retry.
# install.packages('randomForest')
library(randomForest)
# train_a_viewb<-train_a_view[,c(1:4,6,7)]
# train_a_commb<-train_a_comm[,c(1:4,6,7)]
# train_a_votesb<-train_a_votes[,c(1:4,6,7)]

# Random forests on train_a, one per target, trees capped at 30 terminal
# nodes. Fitted in the order views, comments, votes.
rf.fit.views <- randomForest(num_views ~ ., data = train_a_view, maxnodes = 30)
rf.fit.comm <- randomForest(num_comments ~ ., data = train_a_comm, maxnodes = 30)
rf.fit.votes <- randomForest(num_votes ~ ., data = train_a_votes, maxnodes = 30)

# predict() is called without newdata for each forest.
pred.votes <- predict(rf.fit.votes)
pred.comments <- predict(rf.fit.comm)
pred.views <- predict(rf.fit.views)

# Actual values.
act.votes <- train_a[["num_votes"]]
act.comments <- train_a[["num_comments"]]
act.views <- train_a[["num_views"]]

# Score and append a row to the scoreboard.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- setNames(
  data.frame("train_a", "Random Forest-30", p[1], p[2], p[3], mean(p)),
  c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
)
results <- rbind(results, pf)
# Same comparison with shallower trees (maxnodes = 10), as the scoreboard
# label says. The original re-scored the maxnodes = 30 forests here, so this
# row merely duplicated the previous one under a different label. The
# shallower forests get their own names, leaving rf.fit.views / rf.fit.comm /
# rf.fit.votes untouched for the steps that follow.
rf10.fit.views <- randomForest(num_views ~ ., data = train_a_view, maxnodes = 10)
rf10.fit.comm <- randomForest(num_comments ~ ., data = train_a_comm, maxnodes = 10)
rf10.fit.votes <- randomForest(num_votes ~ ., data = train_a_votes, maxnodes = 10)

# Without newdata, predict() on a randomForest fit returns the out-of-bag
# predictions for the training rows.
pred.votes <- predict(rf10.fit.votes)
pred.comments <- predict(rf10.fit.comm)
pred.views <- predict(rf10.fit.views)

# Actual values.
act.votes <- train_a$num_votes
act.comments <- train_a$num_comments
act.views <- train_a$num_views

# Score and append a row to the scoreboard.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame("train_a", "random forest maxnodes 10", p[1], p[2], p[3], mean(p))
colnames(pf) <- c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results <- rbind(results, pf)
# Score the rf.fit.* forests on train_b, the other half of the split.
pred.votes <- predict(rf.fit.votes, newdata = train_b)
pred.comments <- predict(rf.fit.comm, newdata = train_b)
pred.views <- predict(rf.fit.views, newdata = train_b)

# Actual values.
act.votes <- train_b[["num_votes"]]
act.comments <- train_b[["num_comments"]]
act.views <- train_b[["num_views"]]

# Score and append a row to the scoreboard.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- setNames(
  data.frame("train_b", "random forest 30", p[1], p[2], p[3], mean(p)),
  c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
)
results <- rbind(results, pf)
# One forest per city and per target (maxnodes = 20). A single set of row
# masks is used for all three modelling frames; they hold the same rows of
# train_a, so the masks line up.
is_oakland <- train_a$city == "Oakland"
is_chicago <- train_a$city == "Chicago"
is_richmond <- train_a$city == "Richmond"
is_new_haven <- train_a$city == "New Haven"

rf.fit.views.o <- randomForest(num_views ~ ., data = train_a_view[is_oakland, ], maxnodes = 20)
rf.fit.views.c <- randomForest(num_views ~ ., data = train_a_view[is_chicago, ], maxnodes = 20)
rf.fit.views.r <- randomForest(num_views ~ ., data = train_a_view[is_richmond, ], maxnodes = 20)
rf.fit.views.n <- randomForest(num_views ~ ., data = train_a_view[is_new_haven, ], maxnodes = 20)
rf.fit.comm.o <- randomForest(num_comments ~ ., data = train_a_comm[is_oakland, ], maxnodes = 20)
rf.fit.comm.c <- randomForest(num_comments ~ ., data = train_a_comm[is_chicago, ], maxnodes = 20)
rf.fit.comm.r <- randomForest(num_comments ~ ., data = train_a_comm[is_richmond, ], maxnodes = 20)
rf.fit.comm.n <- randomForest(num_comments ~ ., data = train_a_comm[is_new_haven, ], maxnodes = 20)
rf.fit.votes.o <- randomForest(num_votes ~ ., data = train_a_votes[is_oakland, ], maxnodes = 20)
rf.fit.votes.c <- randomForest(num_votes ~ ., data = train_a_votes[is_chicago, ], maxnodes = 20)
rf.fit.votes.r <- randomForest(num_votes ~ ., data = train_a_votes[is_richmond, ], maxnodes = 20)
rf.fit.votes.n <- randomForest(num_votes ~ ., data = train_a_votes[is_new_haven, ], maxnodes = 20)

# Votes predictions per city, on the same rows each forest was fitted to.
pred.votes.o <- predict(rf.fit.votes.o, newdata = train_a[is_oakland, ])
pred.votes.c <- predict(rf.fit.votes.c, newdata = train_a[is_chicago, ])
pred.votes.r <- predict(rf.fit.votes.r, newdata = train_a[is_richmond, ])
pred.votes.n <- predict(rf.fit.votes.n, newdata = train_a[is_new_haven, ])

# Actual votes per city.
act.votes.o <- train_a$num_votes[is_oakland]
act.votes.c <- train_a$num_votes[is_chicago]
act.votes.r <- train_a$num_votes[is_richmond]
act.votes.n <- train_a$num_votes[is_new_haven]

# Score New Haven. All three targets are taken from the New Haven rows and
# the New Haven forests; the original reused the comments/views vectors left
# over from the train_b step, so two of the three recorded scores described
# different data than the label claimed.
pred.votes <- pred.votes.n
act.votes <- act.votes.n
pred.comments <- predict(rf.fit.comm.n, newdata = train_a[is_new_haven, ])
act.comments <- train_a$num_comments[is_new_haven]
pred.views <- predict(rf.fit.views.n, newdata = train_a[is_new_haven, ])
act.views <- train_a$num_views[is_new_haven]

p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame("train_a", "random forest 20-New Haven", p[1], p[2], p[3], mean(p))
colnames(pf) <- c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results <- rbind(results, pf)
# Calculate the final prediction: refit on the later part of the training
# data only.
# Row index at which the final 40% of the data would start (printed for
# reference; computed from the actual row count instead of a typed-in one).
nrow(train_min) - (nrow(train_min) * 0.4)

# NOTE(review): the slice actually starts at row 100000, which is earlier
# than the 40% mark printed above whenever train_min has more than about
# 166,665 rows - confirm which one is intended. The end of the slice follows
# the data instead of a hard-coded last row.
first_row <- 100000
train_a <- train_min[seq_len(nrow(train_min)) >= first_row, ]

# One modelling frame per target: the target followed by all predictors,
# picked by name (avoids a column-count variable named `c`).
target_cols <- c("num_votes", "num_comments", "num_views")
predictor_cols <- setdiff(names(train_a), target_cols)
train_a_votes <- train_a[, c("num_votes", predictor_cols)]
train_a_comm <- train_a[, c("num_comments", predictor_cols)]
train_a_view <- train_a[, c("num_views", predictor_cols)]

rf.fit.views <- randomForest(num_views ~ ., data = train_a_view, maxnodes = 20)
rf.fit.comm <- randomForest(num_comments ~ ., data = train_a_comm, maxnodes = 20)
rf.fit.votes <- randomForest(num_votes ~ ., data = train_a_votes, maxnodes = 20)

# Without newdata, predict() on a randomForest fit returns the out-of-bag
# predictions for the training rows.
pred.votes <- predict(rf.fit.votes)
pred.comments <- predict(rf.fit.comm)
pred.views <- predict(rf.fit.views)

# Actual values.
act.votes <- train_a$num_votes
act.comments <- train_a$num_comments
act.views <- train_a$num_views

# Score and append a row to the scoreboard. The first argument is act.votes
# (the original had the undefined name actr.votes), and the label states the
# maxnodes value actually used above.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame("train_a", "Random Forest-20", p[1], p[2], p[3], mean(p))
colnames(pf) <- c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results <- rbind(results, pf)
# Predict the test data with the current random forests (rf.fit.votes,
# rf.fit.comm, rf.fit.views). Each prediction is computed once; the views
# model is rf.fit.views (the original's second pass referred to an undefined
# rf.fit.view).
pred.votes <- predict(rf.fit.votes, newdata = test)
pred.comments <- predict(rf.fit.comm, newdata = test)
pred.views <- predict(rf.fit.views, newdata = test)

# Now let's use this to generate our submission file.
submitfile("randomfores.csv", test$id, pred.votes, pred.comments, pred.views)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment