Skip to content

Instantly share code, notes, and snippets.

@jkuruzovich
Created November 26, 2013 14:20
Show Gist options
  • Save jkuruzovich/7659050 to your computer and use it in GitHub Desktop.
Save jkuruzovich/7659050 to your computer and use it in GitHub Desktop.
Kaggle_cross_validations
# This script walks through handling missing values and cross validation
# for the Kaggle "See Click Predict Fix" competition.
# Start of Kaggle See Click Predict
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Quick look at what we loaded.
str(train)
head(train)
names(train)
# Simplest possible predictor: the overall mean of each target column.
train.votes.mean <- mean(train[["num_votes"]])
train.comments.mean <- mean(train[["num_comments"]])
train.views.mean <- mean(train[["num_views"]])
# There are lots of ways to assess performance:
# https://www.kaggle.com/wiki/Metrics/history/1012
# Below we define the RMSLE metric used by the leaderboard.
# Root Mean Squared Logarithmic Error (RMSLE) -- the competition metric.
# Both inputs are shifted by +1 so that zero counts are representable.
rmsle <- function(actual, predicted) {
  sq_log_err <- (log(actual + 1) - log(predicted + 1))^2
  sqrt(mean(sq_log_err))
}
# Write a Kaggle submission CSV.
# The targets are non-negative counts, so each prediction vector is
# rounded to a whole number and negative values are clipped to zero
# before the file is written.
submitfile <- function(filename, id, pred.votes, pred.comments, pred.views) {
  # Round to whole counts, then clip negatives to 0.
  clip <- function(x) {
    x <- round(x, digits = 0)
    ifelse(x < 0, 0, x)
  }
  submit <- data.frame(id = id,
                       num_views = clip(pred.views),
                       num_votes = clip(pred.votes),
                       num_comments = clip(pred.comments))
  write.csv(submit, filename, row.names = FALSE)
}
# Score predictions against actuals for the three targets.
# Predictions are cleaned exactly the way submitfile() cleans them
# (rounded to whole counts, negatives clipped to 0) so the score reflects
# what would actually be submitted.
# Returns an unnamed numeric vector: c(votes.rmsle, comments.rmsle, views.rmsle).
# (The original ended with `perfsummary <- c(...)`, which returns the value
# invisibly; returning the vector directly is clearer and prints at the console.)
perf <- function(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
{
  # Same clamping as submitfile(): whole counts, no negatives.
  clean <- function(x) pmax(round(x, digits = 0), 0)
  pred.votes <- clean(pred.votes)
  pred.comments <- clean(pred.comments)
  pred.views <- clean(pred.views)
  # RMSLE per target, in the fixed order the callers index into (p[1..3]).
  c(rmsle(act.votes, pred.votes),
    rmsle(act.comments, pred.comments),
    rmsle(act.views, pred.views))
}
# Keep a running log of how each data/method combination performs.
# BUG FIX: the header previously used "avg.rmsle" while every pf row built
# below uses "mean.rmsle"; rbind.data.frame stops with "names do not match
# previous names" when column names disagree, so the first rbind() failed.
results <- data.frame(matrix(vector(), 0, 6,
                             dimnames = list(c(), c("data", "method",
                                                    "votes.rmsle", "comments.rmsle",
                                                    "views.rmsle", "mean.rmsle"))),
                      stringsAsFactors = F)
# Assign some intermediate variables so we can reuse code more easily.
# Start reuse code: score the mean baseline on the training data.
pred.votes <- train.votes.mean
pred.comments <- train.comments.mean
pred.views <- train.views.mean
act.votes <- train$num_votes
act.comments <- train$num_comments
act.views <- train$num_views
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame("train", "mean", p[1], p[2], p[3], mean(p))
colnames(pf) <- c("data", "method", "votes.rmsle", "comments.rmsle", "views.rmsle", "mean.rmsle")
results <- rbind(results, pf)
write.csv(results, "results.csv", row.names = FALSE)
# End reuse code.
# Generate a submission file from the baseline means.
# NOTE(review): this filename has no .csv extension, unlike "regression3.csv"
# later -- presumably intentional, but confirm before resubmitting.
submitfile("functionoutfile3", test$id, pred.votes, pred.comments, pred.views)
# Submission of this landed at rank ~425 with RMSLE = 1.19153 (leaders ~0.29).
# Discussion of how the metric relates to the model:
# https://www.kaggle.com/c/see-click-predict-fix/forums/t/6378/rmsle-vs-target
# Now apply a simple regression model.
# A previous analysis showed the records cluster into 4 cities; we use
# longitude alone to bucket them:
#   41.85662  -87.68507  Chicago, IL
#   37.54046  -77.46269  Richmond, VA
#   41.31132  -72.92412  New Haven, CT
#   37.80239 -122.24116  Oakland, CA
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
# Initially pull in only 5000 rows to work with data:
# train <- read.csv("train.csv", nrows=5000)
str(train)
head(train)
names(train)
# Hypothesis: city is likely to be meaningful. To judge how meaningful the
# predictions are we will compare them to guessing the overall mean.
# The identical recode is applied to test so it is ready for submission.
longitude_to_city <- function(lon) {
  cut(lon,
      breaks = c(-Inf, -104, -82.5, -75.2, Inf),
      labels = c('Oakland', 'Chicago', 'Richmond', 'New Haven'))
}
train$city <- longitude_to_city(train$longitude)
test$city <- longitude_to_city(test$longitude)
# Look to see potential variables.
str(train)
# created_time is read in as a factor; convert it to POSIXct, and give
# source/tag an explicit "unknown" level used for missing values.
# The original hand-copied this stanza for train and test (with a
# time_char/time_chart naming slip and `=` assignment); one helper applied
# to both frames keeps the cleaning identical by construction.
add_time_and_unknowns <- function(df) {
  # Factor -> character -> POSIXct (timestamps look like "2013-01-02 03:04:05").
  time_char <- as.character(df$created_time)
  df$time <- as.POSIXct(strptime(time_char, format = "%Y-%m-%d %H:%M:%S"))
  # Append an "unknown" level, then assign it to the NA entries.
  for (col in c("source", "tag")) {
    df[[col]] <- factor(df[[col]], levels = c(levels(df[[col]]), "unknown"))
    df[[col]][is.na(df[[col]])] <- "unknown"
  }
  df
}
train <- add_time_and_unknowns(train)
test <- add_time_and_unknowns(test)
# Now select a minimal set of columns for modeling.
train_min <- train[, c('num_votes', 'num_comments', 'num_views', 'latitude',
                       'longitude', 'time', 'tag', 'source', 'city')]
# Verify complete data (prints any rows that still have missing values).
train_min[!complete.cases(train_min), ]
# To estimate how we do from here on out, split the training set in two.
set.seed(1234)
ind <- sample(2, nrow(train_min), replace = TRUE, prob = c(0.5, 0.5))
# Split the data 50/50 into a fitting half and a held-out half.
train_a <- train_min[ind == 1, ]
train_b <- train_min[ind == 2, ]
# Columns 1-3 are the three targets; 4..n_col are the predictors.
# (Renamed from `c`, which shadowed base::c on the very lines that call c().)
n_col <- ncol(train_a)
train_a_votes <- train_a[, c(1, 4:n_col)]
train_a_comm <- train_a[, c(2, 4:n_col)]
train_a_view <- train_a[, c(3, 4:n_col)]
# Just toss everything into a linear regression, one model per target.
lm.fit.votes <- lm(num_votes ~ ., data = train_a_votes)
summary(lm.fit.votes)
lm.fit.comm <- lm(num_comments ~ ., data = train_a_comm)
summary(lm.fit.comm)
lm.fit.view <- lm(num_views ~ ., data = train_a_view)
summary(lm.fit.view)
# In-sample fitted values for train_a.
train_a$pvotes <- predict(lm.fit.votes)
train_a$pcomments <- predict(lm.fit.comm)
train_a$pviews <- predict(lm.fit.view)
# Start reuse code: score the regression in-sample (train_a).
# Set predicted values.
pred.votes <- train_a$pvotes
pred.comments <- train_a$pcomments
pred.views <- train_a$pviews
# Set actual values.
act.votes <- train_a$num_votes
act.comments <- train_a$num_comments
act.views <- train_a$num_views
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
# Named-argument construction gives the row its column names directly.
pf <- data.frame(data = "train_a", method = "regression",
                 votes.rmsle = p[1], comments.rmsle = p[2],
                 views.rmsle = p[3], mean.rmsle = mean(p))
results <- rbind(results, pf)
# End reuse code.
# Compare with the mean baseline above. A nice improvement!
View(results)
# This is an example of cross validation: the models fit on train_a are
# evaluated on the held-out train_b half.
pred.votes <- predict(lm.fit.votes, newdata = train_b)
pred.comments <- predict(lm.fit.comm, newdata = train_b)
pred.views <- predict(lm.fit.view, newdata = train_b)
# Set actual values.
act.votes <- train_b$num_votes
act.comments <- train_b$num_comments
act.views <- train_b$num_views
# Calculate performance and append a row to the running results log.
p <- perf(act.votes, pred.votes, act.comments, pred.comments, act.views, pred.views)
pf <- data.frame(data = "train_b", method = "regression",
                 votes.rmsle = p[1], comments.rmsle = p[2],
                 views.rmsle = p[3], mean.rmsle = mean(p))
results <- rbind(results, pf)
# End reuse code.
# Predict the test data. predict.lm rejects factor levels the model never
# saw during fitting ("bus_lane" appears only in the test tags), so recode
# those rows to "unknown" first. Crucially, reassigning the values alone
# does not remove "bus_lane" from the factor's levels attribute -- and the
# new-level check in predict/model.frame compares levels, not values -- so
# drop the now-unused level as well. (The original ran predict() once just
# to demonstrate the error; doing the recode first lets the script run
# straight through non-interactively.)
test$tag[test$tag == "bus_lane"] <- "unknown"
test$tag <- droplevels(test$tag)
pred.votes <- predict(lm.fit.votes, newdata = test)
pred.comments <- predict(lm.fit.comm, newdata = test)
pred.views <- predict(lm.fit.view, newdata = test)
# Now generate the submission file.
submitfile("regression3.csv", test$id, pred.votes, pred.comments, pred.views)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment