Last active
December 29, 2015 09:29
-
-
Save jkuruzovich/7650448 to your computer and use it in GitHub Desktop.
Kaggle Competition See predict Fix Help
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Start of Kaggle See Click Predict
# NOTE(review): setwd() ties this script to one machine's directory layout; it
# is kept so the relative CSV paths below resolve exactly as before.
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")

# Inspect the structure, the first rows and the column names of the training
# data.
str(train)
head(train)
names(train)

# Of all the ways of predicting something, the first we will explore is just
# taking the mean of each outcome over the training data.
train.votes.mean <- mean(train$num_votes)
train.comments.mean <- mean(train$num_comments)
train.views.mean <- mean(train$num_views)
# There are many ways to assess predictive performance; see
# https://www.kaggle.com/wiki/Metrics/history/1012
# We will create two functions that give us the RMSE and the MAE.
# Root Mean Squared Error between observed and predicted values.
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predictions; a single value is recycled against
#            `actual` by R's usual arithmetic rules.
# na.rm:     if TRUE, missing squared errors are dropped before averaging.
#            Defaults to FALSE, so any NA in the inputs yields NA (the
#            behaviour of the two-argument form).
# Returns a single numeric value.
rmse <- function(actual, predicted, na.rm = FALSE) {
  error <- actual - predicted
  sqrt(mean(error^2, na.rm = na.rm))
}
# Mean Absolute Error between observed and predicted values.
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predictions; a single value is recycled against
#            `actual` by R's usual arithmetic rules.
# na.rm:     if TRUE, missing absolute errors are dropped before averaging.
#            Defaults to FALSE, so any NA in the inputs yields NA (the
#            behaviour of the two-argument form).
# Returns a single numeric value.
mae <- function(actual, predicted, na.rm = FALSE) {
  error <- actual - predicted
  mean(abs(error), na.rm = na.rm)
}
# Let's create a data frame that keeps track of how we do with performance.
# It starts with zero rows and one column per metric; one row per evaluated
# model is appended further down.
perf.cols <- c("data", "method", "votes.rmse", "votes.mae",
               "comments.rmse", "comments.mae", "views.rmse", "views.mae")
results <- data.frame(
  matrix(vector(), 0, length(perf.cols), dimnames = list(c(), perf.cols)),
  stringsAsFactors = FALSE
)

# Let's assign some intermediate variables so we can reuse code more easily.
# The "prediction" for the mean model is a single number per outcome.
pred.votes <- train.votes.mean
pred.comments <- train.comments.mean
pred.views <- train.views.mean
act.votes <- train$num_votes
act.comments <- train$num_comments
act.views <- train$num_views

# This calculates the error for each DV.
perf.votes.rmse <- rmse(act.votes, pred.votes)
perf.votes.mae <- mae(act.votes, pred.votes)
perf.comments.rmse <- rmse(act.comments, pred.comments)
perf.comments.mae <- mae(act.comments, pred.comments)
perf.views.rmse <- rmse(act.views, pred.views)
perf.views.mae <- mae(act.views, pred.views)

# This creates a summary table we can use to keep track of the performance of
# different models.
perfsummary <- data.frame("train", "mean",
                          perf.votes.rmse, perf.votes.mae,
                          perf.comments.rmse, perf.comments.mae,
                          perf.views.rmse, perf.views.mae)
colnames(perfsummary) <- perf.cols
results <- rbind(results, perfsummary)
# Now let's use this model to generate a submission for the test data: every
# test row receives the same training-set mean for each outcome (the scalar
# predictions are recycled across test$id).
test_submit <- data.frame(test$id, pred.views, pred.votes, pred.comments)
colnames(test_submit) <- c("id", "num_views", "num_votes", "num_comments")
write.csv(test_submit, "submitmean.csv", row.names = FALSE)
# Submission of this landed me at 425. KGI = 1.1915. Compared to .29 for the
# leaders.
# 1.19153
# Now let's apply a simple regression model.
# From last class we did an analysis that showed 4 cities. We are going to
# just use longitude/latitude to calculate.
#   1  41.85662   -87.68507  1  Chicago, IL
#   2  37.54046   -77.46269  2  Richmond, VA
#   3  41.31132   -72.92412  3  New Haven, CT
#   4  37.80239  -122.24116  4  Oakland, CA
# NOTE(review): setwd() ties this script to one machine's directory layout; it
# is kept so the relative CSV paths below resolve exactly as before.
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")

# Initially pull in only 5000 rows to work with data. This replaces the `train`
# data frame with the first 5000 rows of the file.
train <- read.csv("train.csv", nrows = 5000)
str(train)
head(train)
names(train)
# One hypothesis could be that in general city is likely to be meaningful. To
# judge how meaningful our predictions are we are going to have to compare
# them to something. Let's compare to just guessing the overall mean each time.
# City is derived by binning longitude alone; the break points sit between the
# longitudes of the four cities.
train$city <- cut(train$longitude,
                  breaks = c(-Inf, -104, -82.5, -75.2, Inf),
                  labels = c("Oakland", "Chicago", "Richmond", "New Haven"))

# Look to see potential variables.
str(train)

# We can see that created_time is incorrectly being identified as a factor,
# going to change first to string.
time_char <- as.character((train$created_time))
# Then to POSIX (time).
train$time <- as.POSIXct(strptime(time_char, format = "%Y-%m-%d %H:%M:%S"))

# Now select a minimal set of columns: the three outcomes plus the predictors.
train_min <- train[, c("num_votes", "num_comments", "num_views",
                       "latitude", "longitude", "time", "city")]

# Verify complete data: this prints any rows with a missing value. It does not
# remove them.
train_min[!complete.cases(train_min), ]
# To see how we do from here on, we can split our training set in 2.
# Each row is assigned to half 1 or half 2 with equal probability; the seed
# makes the split reproducible.
set.seed(1234)
ind <- sample(2, nrow(train_min), replace = TRUE, prob = c(0.5, 0.5))

# Split the data.
train_a <- train_min[ind == 1, ]
train_b <- train_min[ind == 2, ]
names(train_a)

# One data frame per outcome: the outcome column plus the four predictors.
# Columns are selected by name so the selection does not depend on the column
# order of train_min.
train_a_votes <- train_a[, c("num_votes", "latitude", "longitude", "time", "city")]
train_a_comm <- train_a[, c("num_comments", "latitude", "longitude", "time", "city")]
train_a_view <- train_a[, c("num_views", "latitude", "longitude", "time", "city")]

# Let's just toss everything into a linear regression.
lm.fit.votes <- lm(num_votes ~ ., data = train_a_votes)
summary(lm.fit.votes)
lm.fit.comm <- lm(num_comments ~ ., data = train_a_comm)
summary(lm.fit.comm)
lm.fit.view <- lm(num_views ~ ., data = train_a_view)
summary(lm.fit.view)
# Now let's make our predictions for train_a. predict() without `newdata`
# returns the fitted values for the rows each model was trained on.
# NOTE(review): if lm() dropped rows with missing values, the fitted vectors
# would be shorter than train_a and these assignments would fail - confirm the
# complete-cases check above comes back empty.
train_a$pvotes <- predict(lm.fit.votes)
train_a$pcomments <- predict(lm.fit.comm)
# Was `llm.fit.view`, an undefined object; the views model is `lm.fit.view`.
train_a$pviews <- predict(lm.fit.view)
# Build the same derived predictors on the test set that the models were
# trained on: city from longitude bins, and created_time parsed to POSIXct.
test$city <- cut(test$longitude,
                 breaks = c(-Inf, -104, -82.5, -75.2, Inf),
                 labels = c("Oakland", "Chicago", "Richmond", "New Haven"))
time_char_test <- as.character((test$created_time))
# Then to POSIX (time).
test$time <- as.POSIXct(strptime(time_char_test, format = "%Y-%m-%d %H:%M:%S"))

# Predict each outcome for the test rows with the models fitted on train_a.
test$num_votes <- predict(lm.fit.votes, newdata = test)
test$num_comments <- predict(lm.fit.comm, newdata = test)
test$num_views <- predict(lm.fit.view, newdata = test)

# A linear model can predict negative values; clamp those to zero.
test$num_votes <- pmax(test$num_votes, 0)
test$num_comments <- pmax(test$num_comments, 0)
test$num_views <- pmax(test$num_views, 0)

# Write the regression submission file.
test_submit2 <- data.frame(test$id, test$num_views, test$num_votes, test$num_comments)
colnames(test_submit2) <- c("id", "num_views", "num_votes", "num_comments")
write.csv(test_submit2, "submitreg3.csv", row.names = FALSE)
# Fit models on the whole `train` data frame using every other column as a
# predictor.
# NOTE(review): `.` expands to all remaining columns of `train`, which includes
# the other outcome columns (num_votes, num_views) - confirm that is intended.
lm.fit.comments <- lm(num_comments ~ ., data = train)
lm.predict.comments <- predict(lm.fit.comments)
# In-sample mean squared error of the comments model.
# NOTE(review): the name says "votes" but the value is computed from the
# comments predictions; name kept as-is in case later code refers to it.
lm.perf.votes <- mean((lm.predict.comments - train$num_comments)^2)
# Was `data=training`, an object that is never defined; the data frame used
# throughout this script is `train`.
lm.fit.views <- lm(num_views ~ ., data = train)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment