Skip to content

Instantly share code, notes, and snippets.

@jkuruzovich
Created November 26, 2013 14:20
Show Gist options
  • Save jkuruzovich/7659050 to your computer and use it in GitHub Desktop.
Save jkuruzovich/7659050 to your computer and use it in GitHub Desktop.
Kaggle_cross_validations
# This script walks through handling missing values and cross validation
# for the Kaggle "See Click Predict Fix" competition.
# Start of Kaggle See Click Predict
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Quick look at what we loaded.
str(train)
head(train)
names(train)
# Simplest possible predictor: the overall mean of each target column.
train.votes.mean <- mean(train[["num_votes"]])
train.comments.mean <- mean(train[["num_comments"]])
train.views.mean <- mean(train[["num_views"]])
# There are lots of ways to assess performance:
# https://www.kaggle.com/wiki/Metrics/history/1012
# Below we define the RMSLE metric used by the leaderboard.
# Root Mean Squared Logarithmic Error (RMSLE) -- the competition metric.
# Both inputs are shifted by +1 so that zero counts are representable.
rmsle <- function(actual, predicted) {
  sq_log_err <- (log(actual + 1) - log(predicted + 1))^2
  sqrt(mean(sq_log_err))
}
# Write a Kaggle submission CSV.
# The targets are non-negative counts, so each prediction vector is
# rounded to a whole number and negative values are clipped to zero
# before the file is written.
submitfile <- function(filename, id, pred.votes, pred.comments, pred.views) {
  # Round to whole counts, then clip negatives to 0.
  clip <- function(x) {
    x <- round(x, digits = 0)
    ifelse(x < 0, 0, x)
  }
  submit <- data.frame(id = id,
                       num_views = clip(pred.views),
                       num_votes = clip(pred.votes),
                       num_comments = clip(pred.comments))
  write.csv(submit, filename, row.names = FALSE)
}
# Score predictions against actuals for the three targets.
# Predictions are cleaned exactly the way submitfile() cleans them
# (rounded to whole counts, negatives clipped to 0) so the score reflects
# what would actually be submitted.
# Returns an unnamed numeric vector: c(votes.rmsle, comments.rmsle, views.rmsle).
# (The original ended with `perfsummary <- c(...)`, which returns the value
# invisibly; returning the vector directly is clearer and prints at the console.)
perf <- function(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
{
  # Same clamping as submitfile(): whole counts, no negatives.
  clean <- function(x) pmax(round(x, digits = 0), 0)
  pred.votes <- clean(pred.votes)
  pred.comments <- clean(pred.comments)
  pred.views <- clean(pred.views)
  # RMSLE per target, in the fixed order the callers index into (p[1..3]).
  c(rmsle(act.votes, pred.votes),
    rmsle(act.comments, pred.comments),
    rmsle(act.views, pred.views))
}
# Keep a running log of how each data/method combination performs.
# BUG FIX: the header previously used "avg.rmsle" while every pf row built
# below uses "mean.rmsle"; rbind.data.frame stops with "names do not match
# previous names" when column names disagree, so the first rbind() failed.
results <- data.frame(matrix(vector(), 0, 6,
                             dimnames = list(c(), c("data", "method",
                                                    "votes.rmsle", "comments.rmsle",
                                                    "views.rmsle", "mean.rmsle"))),
                      stringsAsFactors = F)
# Assign some intermediate variables so we can reuse code more easily.
# Start reuse code: score the mean baseline on the training data.
pred.votes <- train.votes.mean
pred.comments <- train.comments.mean
pred.views <- train.views.mean
act.votes <- train$num_votes
act.comments <- train$num_comments
act.views <- train$num_views
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame("train", "mean", p[1], p[2], p[3], mean(p))
colnames(pf) <- c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results <- rbind(results, pf)
write.csv(results, "results.csv", row.names = FALSE)
# End reuse code.
# Generate a submission file from the baseline means.
# NOTE(review): this filename has no .csv extension, unlike "regression3.csv"
# later -- presumably intentional, but confirm before resubmitting.
submitfile("functionoutfile3", test$id, pred.votes, pred.comments, pred.views)
# Submission of this landed at rank ~425 with RMSLE = 1.19153 (leaders ~0.29).
# Discussion of how the metric relates to the model:
# https://www.kaggle.com/c/see-click-predict-fix/forums/t/6378/rmsle-vs-target
# Now apply a simple regression model.
# A previous analysis showed the records cluster into 4 cities; we use
# longitude alone to bucket them:
#   41.85662  -87.68507  Chicago, IL
#   37.54046  -77.46269  Richmond, VA
#   41.31132  -72.92412  New Haven, CT
#   37.80239 -122.24116  Oakland, CA
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
# Initially pull in only 5000 rows to work with data:
# train <- read.csv("train.csv", nrows=5000)
str(train)
head(train)
names(train)
# Hypothesis: city is likely to be meaningful. To judge how meaningful the
# predictions are we will compare them to guessing the overall mean.
# The identical recode is applied to test so it is ready for submission.
longitude_to_city <- function(lon) {
  cut(lon,
      breaks = c(-Inf, -104, -82.5, -75.2, Inf),
      labels = c('Oakland', 'Chicago', 'Richmond', 'New Haven'))
}
train$city <- longitude_to_city(train$longitude)
test$city <- longitude_to_city(test$longitude)
# Look to see potential variables.
str(train)
# created_time is read in as a factor; convert it to POSIXct, and give
# source/tag an explicit "unknown" level used for missing values.
# The original hand-copied this stanza for train and test (with a
# time_char/time_chart naming slip and `=` assignment); one helper applied
# to both frames keeps the cleaning identical by construction.
add_time_and_unknowns <- function(df) {
  # Factor -> character -> POSIXct (timestamps look like "2013-01-02 03:04:05").
  time_char <- as.character(df$created_time)
  df$time <- as.POSIXct(strptime(time_char, format = "%Y-%m-%d %H:%M:%S"))
  # Append an "unknown" level, then assign it to the NA entries.
  for (col in c("source", "tag")) {
    df[[col]] <- factor(df[[col]], levels = c(levels(df[[col]]), "unknown"))
    df[[col]][is.na(df[[col]])] <- "unknown"
  }
  df
}
train <- add_time_and_unknowns(train)
test <- add_time_and_unknowns(test)
# Now select a minimal set of columns for modeling.
train_min <- train[, c('num_votes', 'num_comments', 'num_views', 'latitude',
                       'longitude', 'time', 'tag', 'source', 'city')]
# Verify complete data (prints any rows that still have missing values).
train_min[!complete.cases(train_min), ]
# To estimate how we do from here on out, split the training set in two.
set.seed(1234)
ind <- sample(2, nrow(train_min), replace = TRUE, prob = c(0.5, 0.5))
# Split the data 50/50 into a fitting half and a held-out half.
train_a <- train_min[ind == 1, ]
train_b <- train_min[ind == 2, ]
# Columns 1-3 are the three targets; 4..n_col are the predictors.
# (Renamed from `c`, which shadowed base::c on the very lines that call c().)
n_col <- ncol(train_a)
train_a_votes <- train_a[, c(1, 4:n_col)]
train_a_comm <- train_a[, c(2, 4:n_col)]
train_a_view <- train_a[, c(3, 4:n_col)]
# Just toss everything into a linear regression, one model per target.
lm.fit.votes <- lm(num_votes ~ ., data = train_a_votes)
summary(lm.fit.votes)
lm.fit.comm <- lm(num_comments ~ ., data = train_a_comm)
summary(lm.fit.comm)
lm.fit.view <- lm(num_views ~ ., data = train_a_view)
summary(lm.fit.view)
# In-sample fitted values for train_a.
train_a$pvotes <- predict(lm.fit.votes)
train_a$pcomments <- predict(lm.fit.comm)
train_a$pviews <- predict(lm.fit.view)
# Start reuse code: score the regression in-sample (train_a).
# Set predicted values.
pred.votes <- train_a$pvotes
pred.comments <- train_a$pcomments
pred.views <- train_a$pviews
# Set actual values.
act.votes <- train_a$num_votes
act.comments <- train_a$num_comments
act.views <- train_a$num_views
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
# Named-argument construction gives the row its column names directly.
pf <- data.frame(data = "train_a", method = "regression",
                 votes.rmsle = p[1], comments.rmsle = p[2],
                 views.rmsle = p[3], mean.rmsle = mean(p))
results <- rbind(results, pf)
# End reuse code.
# Compare with the mean baseline above. A nice improvement!
View(results)
# This is an example of cross validation: the models fit on train_a are
# evaluated on the held-out train_b half.
pred.votes <- predict(lm.fit.votes, newdata = train_b)
pred.comments <- predict(lm.fit.comm, newdata = train_b)
pred.views <- predict(lm.fit.view, newdata = train_b)
# Set actual values.
act.votes <- train_b$num_votes
act.comments <- train_b$num_comments
act.views <- train_b$num_views
# Calculate performance and append a row to the running results log.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame(data = "train_b", method = "regression",
                 votes.rmsle = p[1], comments.rmsle = p[2],
                 views.rmsle = p[3], mean.rmsle = mean(p))
results <- rbind(results, pf)
# End reuse code.
# Predict the test data. predict.lm rejects factor levels the model never
# saw during fitting ("bus_lane" appears only in the test tags), so recode
# those rows to "unknown" first. Crucially, reassigning the values alone
# does not remove "bus_lane" from the factor's levels attribute -- and the
# new-level check in predict/model.frame compares levels, not values -- so
# drop the now-unused level as well. (The original ran predict() once just
# to demonstrate the error; doing the recode first lets the script run
# straight through non-interactively.)
test$tag[test$tag == "bus_lane"] <- "unknown"
test$tag <- droplevels(test$tag)
pred.votes <- predict(lm.fit.votes, newdata = test)
pred.comments <- predict(lm.fit.comm, newdata = test)
pred.views <- predict(lm.fit.view, newdata = test)
# Now generate the submission file.
submitfile("regression3.csv", test$id, pred.votes, pred.comments, pred.views)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment