Skip to content

Instantly share code, notes, and snippets.

@jkuruzovich
Last active December 29, 2015 09:29
Show Gist options
  • Save jkuruzovich/7650448 to your computer and use it in GitHub Desktop.
Save jkuruzovich/7650448 to your computer and use it in GitHub Desktop.
Kaggle Competition: "See Click Predict Fix" — helper script
#Start of Kaggle See Click Predict
#Load the competition data from the working directory and take a first look.
#NOTE(review): setwd() hard-codes a personal Dropbox path — works only on the
#author's machine; run from the data directory instead if reproducing.
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")
#Quick structural inspection of the training data (printed interactively).
str(train)
head(train)
names(train)
#Of all the ways of predicting something, the first we will explore is just taking the mean
#Baseline model: predict the training-set mean for each dependent variable.
train.votes.mean<-mean(train$num_votes)
train.comments.mean<-mean(train$num_comments)
train.views.mean<-mean(train$num_views)
#There are lots of ways to assess performance.
#https://www.kaggle.com/wiki/Metrics/history/1012
#We will create 2 functions that will give us the RMSE and the MAE
# Root Mean Squared Error of `predicted` against `actual` (vectors of equal
# length). Returns a single numeric value.
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}
# Mean Absolute Error of `predicted` against `actual` (vectors of equal
# length). Returns a single numeric value.
mae <- function(actual, predicted) {
  mean(abs(actual - predicted))
}
#Let's create a data frame that keeps track of how we do with performance.
# Empty (0-row) tracking table: one row per (data, method) model evaluated,
# with RMSE and MAE for each of the three dependent variables.
# FIX: use TRUE/FALSE spelled out — `F` is an ordinary variable that can be
# reassigned, so `stringsAsFactors=F` is fragile.
results <- data.frame(
  matrix(vector(), 0, 8,
         dimnames = list(NULL,
                         c("data", "method", "votes.rmse", "votes.mae",
                           "comments.rmse", "comments.mae",
                           "views.rmse", "views.mae"))),
  stringsAsFactors = FALSE)
#let's assign some intermediate variables so we can reuse code more easily.
# pred.* hold the model's predictions (here: the training means),
# act.* hold the observed training values; the scoring code below can then
# be reused unchanged for any other model.
pred.votes    <- train.votes.mean
pred.comments <- train.comments.mean
pred.views    <- train.views.mean
act.votes    <- train$num_votes
act.comments <- train$num_comments
act.views    <- train$num_views
# Score the mean-only model on each dependent variable.
perf.votes.rmse    <- rmse(act.votes, pred.votes)
perf.votes.mae     <- mae(act.votes, pred.votes)
perf.comments.rmse <- rmse(act.comments, pred.comments)
perf.comments.mae  <- mae(act.comments, pred.comments)
perf.views.rmse    <- rmse(act.views, pred.views)
perf.views.mae     <- mae(act.views, pred.views)
# One summary row for this (data, method) pair, appended to the running table.
perfsummary <- data.frame("train", "mean",
                          perf.votes.rmse, perf.votes.mae,
                          perf.comments.rmse, perf.comments.mae,
                          perf.views.rmse, perf.views.mae)
colnames(perfsummary) <- c("data", "method", "votes.rmse", "votes.mae",
                           "comments.rmse", "comments.mae",
                           "views.rmse", "views.mae")
results <- rbind(results, perfsummary)
#Now let's use this to generate test data for our data based on this model.
# Build the submission directly with named columns (same result as creating
# an unnamed frame and assigning colnames afterwards).
test_submit <- data.frame(id           = test$id,
                          num_views    = pred.views,
                          num_votes    = pred.votes,
                          num_comments = pred.comments)
write.csv(test_submit, "submitmean.csv", row.names = FALSE)
#Submission of this landed me at 425. KGI = 1.1915. Compared to .29 for the leaders.
#1.19153
#Now let's apply a simple regression model.
#From last class we did an analysis that showed 4 cities. We are going to just use longitude/latitude to calculate.
#Reference coordinates (lat, long) for the four cities in the data set:
#1 41.85662 -87.68507 1 Chicago, IL
#2 2 37.54046 -77.46269 2 Richmond, VA
#3 3 41.31132 -72.92412 3 New Haven, CT
#4 4 37.80239 -122.24116 4 Oakland, CA
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
#Initially pull in only 5000 rows to work with data.
#NOTE(review): this overwrites the full `train` loaded earlier with a 5000-row
#sample, so everything below runs on the subset only.
train <- read.csv("train.csv", nrows=5000)
str(train)
head(train)
names(train)
#One hypothesis could be that in general city is likely to be meaningful. To judge how meaningful our
#predictions are we are going to have to compare them to something. Let's compare to just guessing the
#overall mean each time.
#Derive a city factor from longitude: the four cities are far enough apart
#that longitude breakpoints alone separate them (see coordinates above).
train$city <- cut(train$longitude,
breaks=c(-Inf, -104, -82.5, -75.2, Inf),
labels=c('Oakland','Chicago', 'Richmond', 'New Haven'))
#Look to see potential variables
str(train)
#We can see that created_time is incorrectly being identified as a factor, going to change first to string
time_char<-as.character((train$created_time))
#Then to POSIX (time)
#NOTE(review): no tz is given, so parsing uses the local timezone — confirm
#the timestamps are meant to be local/UTC if reproducing.
train$time<-as.POSIXct(strptime(time_char, format = "%Y-%m-%d %H:%M:%S"))
#Now select a minimal set of columns: the three DVs plus the predictors.
train_min<-train[,c('num_votes','num_comments','num_views','latitude', 'longitude', 'time', 'city')]
#verify complete data (prints any rows with missing values)
train_min[!complete.cases(train_min),]
#to see how we do from here out, we can split our training set in 2
# Fixed seed so the 50/50 assignment is reproducible.
set.seed(1234)
ind <- sample(2, nrow(train_min), replace=TRUE, prob=c(0.5,0.5))
#Split the data
train_a <- train_min[ind==1,]
train_b <- train_min[ind==2,]
names(train_a)
# One data frame per dependent variable: the DV plus the shared predictors.
# (Selecting by name — same columns as the original positional c(1,4:7) etc.)
train_a_votes <- train_a[, c("num_votes",    "latitude", "longitude", "time", "city")]
train_a_comm  <- train_a[, c("num_comments", "latitude", "longitude", "time", "city")]
train_a_view  <- train_a[, c("num_views",    "latitude", "longitude", "time", "city")]
#Let's just toss everything into a linear regression: one model per DV,
#using all remaining columns (latitude, longitude, time, city) as predictors.
lm.fit.votes <- lm(num_votes~., data=train_a_votes)
summary(lm.fit.votes)
lm.fit.comm <- lm(num_comments~., data=train_a_comm)
summary(lm.fit.comm)
lm.fit.view <- lm(num_views~., data=train_a_view)
summary(lm.fit.view)
#Now let's make our predictions for both our train_a (in-sample predictions).
train_a$pvotes <- predict(lm.fit.votes)
train_a$pcomments <- predict(lm.fit.comm)
# BUG FIX: the original called predict(llm.fit.view) — a typo for
# lm.fit.view — which errors with "object 'llm.fit.view' not found".
train_a$pviews <- predict(lm.fit.view)
# Apply the same feature engineering to the test set: city from longitude
# breakpoints, POSIXct timestamp from created_time.
test$city <- cut(test$longitude,
                 breaks=c(-Inf, -104, -82.5, -75.2, Inf),
                 labels=c('Oakland','Chicago', 'Richmond', 'New Haven'))
time_char_test <- as.character((test$created_time))
#Then to POSIX (time)
test$time <- as.POSIXct(strptime(time_char_test, format = "%Y-%m-%d %H:%M:%S"))
# Predict each dependent variable for the test rows with the fitted models.
test$num_votes    <- predict(lm.fit.votes, newdata=test)
test$num_comments <- predict(lm.fit.comm, newdata=test)
test$num_views    <- predict(lm.fit.view, newdata=test)
# Counts cannot be negative: clamp predictions at zero.
# (pmax(x, 0) is elementwise and equivalent to ifelse(x < 0, 0, x).)
test$num_votes    <- pmax(test$num_votes, 0)
test$num_comments <- pmax(test$num_comments, 0)
test$num_views    <- pmax(test$num_views, 0)
# Submission file in the column order Kaggle expects.
test_submit2 <- data.frame(id           = test$id,
                           num_views    = test$num_views,
                           num_votes    = test$num_votes,
                           num_comments = test$num_comments)
write.csv(test_submit2, "submitreg3.csv", row.names=FALSE)
# Exploratory extras (the script appears to end mid-experiment here):
# regress comments on every other column of the full training frame.
# NOTE(review): with "~ ." this includes id and any text/factor columns,
# so the fit may be slow or overparameterized — confirm intent.
lm.fit.comments <- lm(num_comments~ ., data=train)
lm.predict.comments <- predict(lm.fit.comments)
# In-sample mean squared error of the comments model.
# BUG FIX: the original stored this in `lm.perf.votes` although it scores
# the comments model — renamed to match what it measures.
lm.perf.comments <- mean((lm.predict.comments - train$num_comments)^2)
# BUG FIX: the original used data=training, but no object named `training`
# exists anywhere in the script; the data frame is called `train`.
lm.fit.views <- lm(num_views~ ., data=train)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment