Skip to content

Instantly share code, notes, and snippets.

@jkuruzovich
Last active December 29, 2015 09:29
Show Gist options
  • Save jkuruzovich/7650448 to your computer and use it in GitHub Desktop.
Save jkuruzovich/7650448 to your computer and use it in GitHub Desktop.
Kaggle Competition: "See Click Predict Fix" — helper script
#Start of Kaggle See Click Predict
#Load the competition data from the working directory and take a first look.
#NOTE(review): setwd() hard-codes a personal Dropbox path — works only on the
#author's machine; run from the data directory instead if reproducing.
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")
#Quick structural inspection of the training data (printed interactively).
str(train)
head(train)
names(train)
#Of all the ways of predicting something, the first we will explore is just taking the mean
#Baseline model: predict the training-set mean for each dependent variable.
train.votes.mean<-mean(train$num_votes)
train.comments.mean<-mean(train$num_comments)
train.views.mean<-mean(train$num_views)
#There are lots of ways to assess performance.
#https://www.kaggle.com/wiki/Metrics/history/1012
#We will create 2 functions that will give us the RMSE and the MAE
# Root Mean Squared Error of `predicted` against `actual` (vectors of equal
# length). Returns a single numeric value.
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}
# Mean Absolute Error of `predicted` against `actual` (vectors of equal
# length). Returns a single numeric value.
mae <- function(actual, predicted) {
  mean(abs(actual - predicted))
}
#Let's create a data frame that keeps track of how we do with performance.
# Empty (0-row) tracking table: one row per (data, method) model evaluated,
# with RMSE and MAE for each of the three dependent variables.
# FIX: use TRUE/FALSE spelled out — `F` is an ordinary variable that can be
# reassigned, so `stringsAsFactors=F` is fragile.
results <- data.frame(
  matrix(vector(), 0, 8,
         dimnames = list(NULL,
                         c("data", "method", "votes.rmse", "votes.mae",
                           "comments.rmse", "comments.mae",
                           "views.rmse", "views.mae"))),
  stringsAsFactors = FALSE)
#let's assign some intermediate variables so we can reuse code more easily.
# pred.* hold the model's predictions (here: the training means),
# act.* hold the observed training values; the scoring code below can then
# be reused unchanged for any other model.
pred.votes    <- train.votes.mean
pred.comments <- train.comments.mean
pred.views    <- train.views.mean
act.votes    <- train$num_votes
act.comments <- train$num_comments
act.views    <- train$num_views
# Score the mean-only model on each dependent variable.
perf.votes.rmse    <- rmse(act.votes, pred.votes)
perf.votes.mae     <- mae(act.votes, pred.votes)
perf.comments.rmse <- rmse(act.comments, pred.comments)
perf.comments.mae  <- mae(act.comments, pred.comments)
perf.views.rmse    <- rmse(act.views, pred.views)
perf.views.mae     <- mae(act.views, pred.views)
# One summary row for this (data, method) pair, appended to the running table.
perfsummary <- data.frame("train", "mean",
                          perf.votes.rmse, perf.votes.mae,
                          perf.comments.rmse, perf.comments.mae,
                          perf.views.rmse, perf.views.mae)
colnames(perfsummary) <- c("data", "method", "votes.rmse", "votes.mae",
                           "comments.rmse", "comments.mae",
                           "views.rmse", "views.mae")
results <- rbind(results, perfsummary)
#Now let's use this to generate test data for our data based on this model.
# Build the submission directly with named columns (same result as creating
# an unnamed frame and assigning colnames afterwards).
test_submit <- data.frame(id           = test$id,
                          num_views    = pred.views,
                          num_votes    = pred.votes,
                          num_comments = pred.comments)
write.csv(test_submit, "submitmean.csv", row.names = FALSE)
#Submission of this landed me at 425. KGI = 1.1915. Compared to .29 for the leaders.
#1.19153
#Now let's apply a simple regression model.
#From last class we did an analysis that showed 4 cities. We are going to just use longitude/latitude to calculate.
#Reference coordinates (lat, long) for the four cities in the data set:
#1 41.85662 -87.68507 1 Chicago, IL
#2 2 37.54046 -77.46269 2 Richmond, VA
#3 3 41.31132 -72.92412 3 New Haven, CT
#4 4 37.80239 -122.24116 4 Oakland, CA
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
#Initially pull in only 5000 rows to work with data.
#NOTE(review): this overwrites the full `train` loaded earlier with a 5000-row
#sample, so everything below runs on the subset only.
train <- read.csv("train.csv", nrows=5000)
str(train)
head(train)
names(train)
#One hypothesis could be that in general city is likely to be meaningful. To judge how meaningful our
#predictions are we are going to have to compare them to something. Let's compare to just guessing the
#overall mean each time.
#Derive a city factor from longitude: the four cities are far enough apart
#that longitude breakpoints alone separate them (see coordinates above).
train$city <- cut(train$longitude,
breaks=c(-Inf, -104, -82.5, -75.2, Inf),
labels=c('Oakland','Chicago', 'Richmond', 'New Haven'))
#Look to see potential variables
str(train)
#We can see that created_time is incorrectly being identified as a factor, going to change first to string
time_char<-as.character((train$created_time))
#Then to POSIX (time)
#NOTE(review): no tz is given, so parsing uses the local timezone — confirm
#the timestamps are meant to be local/UTC if reproducing.
train$time<-as.POSIXct(strptime(time_char, format = "%Y-%m-%d %H:%M:%S"))
#Now select a minimal set of columns: the three DVs plus the predictors.
train_min<-train[,c('num_votes','num_comments','num_views','latitude', 'longitude', 'time', 'city')]
#verify complete data (prints any rows with missing values)
train_min[!complete.cases(train_min),]
#to see how we do from here out, we can split our training set in 2
# Fixed seed so the 50/50 assignment is reproducible.
set.seed(1234)
ind <- sample(2, nrow(train_min), replace=TRUE, prob=c(0.5,0.5))
#Split the data
train_a <- train_min[ind==1,]
train_b <- train_min[ind==2,]
names(train_a)
# One data frame per dependent variable: the DV plus the shared predictors.
# (Selecting by name — same columns as the original positional c(1,4:7) etc.)
train_a_votes <- train_a[, c("num_votes",    "latitude", "longitude", "time", "city")]
train_a_comm  <- train_a[, c("num_comments", "latitude", "longitude", "time", "city")]
train_a_view  <- train_a[, c("num_views",    "latitude", "longitude", "time", "city")]
#Let's just toss everything into a linear regression: one model per DV,
#using all remaining columns (latitude, longitude, time, city) as predictors.
lm.fit.votes <- lm(num_votes~., data=train_a_votes)
summary(lm.fit.votes)
lm.fit.comm <- lm(num_comments~., data=train_a_comm)
summary(lm.fit.comm)
lm.fit.view <- lm(num_views~., data=train_a_view)
summary(lm.fit.view)
#Now let's make our predictions for both our train_a (in-sample predictions).
train_a$pvotes <- predict(lm.fit.votes)
train_a$pcomments <- predict(lm.fit.comm)
# BUG FIX: the original called predict(llm.fit.view) — a typo for
# lm.fit.view — which errors with "object 'llm.fit.view' not found".
train_a$pviews <- predict(lm.fit.view)
# Apply the same feature engineering to the test set: city from longitude
# breakpoints, POSIXct timestamp from created_time.
test$city <- cut(test$longitude,
                 breaks=c(-Inf, -104, -82.5, -75.2, Inf),
                 labels=c('Oakland','Chicago', 'Richmond', 'New Haven'))
time_char_test <- as.character((test$created_time))
#Then to POSIX (time)
test$time <- as.POSIXct(strptime(time_char_test, format = "%Y-%m-%d %H:%M:%S"))
# Predict each dependent variable for the test rows with the fitted models.
test$num_votes    <- predict(lm.fit.votes, newdata=test)
test$num_comments <- predict(lm.fit.comm, newdata=test)
test$num_views    <- predict(lm.fit.view, newdata=test)
# Counts cannot be negative: clamp predictions at zero.
# (pmax(x, 0) is elementwise and equivalent to ifelse(x < 0, 0, x).)
test$num_votes    <- pmax(test$num_votes, 0)
test$num_comments <- pmax(test$num_comments, 0)
test$num_views    <- pmax(test$num_views, 0)
# Submission file in the column order Kaggle expects.
test_submit2 <- data.frame(id           = test$id,
                           num_views    = test$num_views,
                           num_votes    = test$num_votes,
                           num_comments = test$num_comments)
write.csv(test_submit2, "submitreg3.csv", row.names=FALSE)
# Exploratory extras (the script appears to end mid-experiment here):
# regress comments on every other column of the full training frame.
# NOTE(review): with "~ ." this includes id and any text/factor columns,
# so the fit may be slow or overparameterized — confirm intent.
lm.fit.comments <- lm(num_comments~ ., data=train)
lm.predict.comments <- predict(lm.fit.comments)
# In-sample mean squared error of the comments model.
# BUG FIX: the original stored this in `lm.perf.votes` although it scores
# the comments model — renamed to match what it measures.
lm.perf.comments <- mean((lm.predict.comments - train$num_comments)^2)
# BUG FIX: the original used data=training, but no object named `training`
# exists anywhere in the script; the data frame is called `train`.
lm.fit.views <- lm(num_views~ ., data=train)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment