Skip to content

Instantly share code, notes, and snippets.

@Lanbig
Created May 13, 2016 05:44
Show Gist options
  • Save Lanbig/9c365cbd372c0d7aed81011338893e91 to your computer and use it in GitHub Desktop.
Save Lanbig/9c365cbd372c0d7aed81011338893e91 to your computer and use it in GitHub Desktop.
##########
# Load data
# NOTE(review): setwd() with a user-specific path ties the script to one
# machine; it is kept because the relative file name below depends on it.
setwd("~/Dropbox/DePaul/OnlineNewsPopularity")
OnlineNewsPopularity <- read.csv(file = "OnlineNewsPopularity.csv", header = TRUE)

########### Data preprocessing step ##########
# Build the working copy without the ID columns. The raw import is left
# untouched because later steps still read columns from it.
id_columns <- c("url", "timedelta")
df.onNews <- OnlineNewsPopularity[
  , setdiff(names(OnlineNewsPopularity), id_columns), drop = FALSE
]
# Collapse the weekday dummy columns into one factor column, pubDay.
# Flags are tested Monday first; a row with none of Monday..Saturday set is
# labelled "sunday".
weekday_names <- c(
  "monday", "tuesday", "wednesday", "thursday", "friday", "saturday"
)
pub_day <- "sunday"
# Build the nested choice from the inside out, so the earliest weekday ends
# up as the outermost (highest-priority) test.
for (day in rev(weekday_names)) {
  day_flag <- OnlineNewsPopularity[[paste0("weekday_is_", day)]]
  pub_day <- ifelse(day_flag == 1, day, pub_day)
}
df.onNews$pubDay <- factor(pub_day)

# The dummy columns are redundant once pubDay exists.
weekday_dummies <- paste0("weekday_is_", c(weekday_names, "sunday"))
df.onNews <- df.onNews[
  , setdiff(names(df.onNews), weekday_dummies), drop = FALSE
]
# Collapse the data-channel dummy columns into one factor column, dataChannel.
# Flags are tested in the order listed below; the first one that is set wins.
# "world" is only assigned when data_channel_is_world is actually set. Rows
# with no channel flag at all get "other" instead of being silently counted
# as "world".
# NOTE(review): the public UCI copy of this dataset is known to contain rows
# with no channel flag -- confirm against the CSV used here.
channel_labels <- c(
  lifestyle = "lifestyle",
  entertainment = "entertainment",
  bus = "business",
  socmed = "Social",
  tech = "tech",
  world = "world"
)
data_channel <- "other"
# Build the nested choice from the inside out, so the first listed channel
# ends up as the outermost (highest-priority) test.
for (suffix in rev(names(channel_labels))) {
  channel_flag <- OnlineNewsPopularity[[paste0("data_channel_is_", suffix)]]
  data_channel <- ifelse(
    channel_flag == 1, channel_labels[[suffix]], data_channel
  )
}
df.onNews$dataChannel <- factor(data_channel)

# The dummy columns are redundant once dataChannel exists.
channel_dummies <- paste0("data_channel_is_", names(channel_labels))
df.onNews <- df.onNews[
  , setdiff(names(df.onNews), channel_dummies), drop = FALSE
]
# Binarise the target: more than 1300 shares is "high", everything else "low".
above_threshold <- OnlineNewsPopularity$shares > 1300
df.onNews$share2bins <- factor(ifelse(above_threshold, "high", "low"))
######
# Exploratory plots, filled by the share2bins class.
library(ggplot2)
theme_set(theme_bw(base_size = 28))

# dataChannel and pubDay are factors, so they are drawn as bar charts
# (counts per level); a histogram only accepts a continuous x variable.
ggplot(df.onNews, aes(x = dataChannel, fill = share2bins)) +
  geom_bar() +
  labs(title = "Histogram for data channel", x = "Channel", y = "count")
table(df.onNews$dataChannel, df.onNews$share2bins)

ggplot(df.onNews, aes(x = pubDay, fill = share2bins)) +
  geom_bar() +
  labs(title = "Histogram for publication day", x = "Day", y = "count")
table(df.onNews$pubDay, df.onNews$share2bins)

# Continuous predictors: histograms restricted to the x range [3.8, 8000].
ggplot(df.onNews, aes(x = self_reference_min_shares, fill = share2bins)) +
  geom_histogram() +
  xlim(3.8, 8000) +
  labs(
    title = "Histogram for self_reference_min_shares",
    x = "Min. shares of referenced articles in Mashable",
    y = "count"
  )
# Cross-tabulate against the class column (share2bins, not "share2bins0").
table(df.onNews$self_reference_min_shares, df.onNews$share2bins)

ggplot(df.onNews, aes(x = kw_avg_avg, fill = share2bins)) +
  geom_histogram() +
  xlim(3.8, 8000) +
  labs(title = "Histogram for kw_avg_avg", x = "Avg. keyword", y = "count")

hist(df.onNews$shares, xlim = c(3.8, 500000))
#######
# share2bins is derived from shares, so shares must not remain as a predictor.
df.onNews$shares <- NULL
###########Required Library##########
library(rpart.plot)
library(caret)
library(e1071)
library(klaR)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
########### Classification modelling step ##########
# Fixed seed so the partition below is reproducible.
set.seed(415)

# 66% / 34% train/test split, partitioned on the class column share2bins.
trainIndex <- createDataPartition(
  y = df.onNews$share2bins,
  p = 0.66,
  list = FALSE
)
data_train <- df.onNews[trainIndex, ]
data_test <- df.onNews[-trainIndex, ]
############# Decision Tree #####################
Sys.time()
# 10-fold cross-validation settings shared by every rpart fit below.
cv_control <- trainControl(method = "cv", number = 10)

# Same model, increasing number of candidate tuning values (tuneLength).
fit.rpart5 <- train(share2bins ~ ., data = data_train, method = "rpart",
                    tuneLength = 5, trControl = cv_control)
fit.rpart8 <- train(share2bins ~ ., data = data_train, method = "rpart",
                    tuneLength = 8, trControl = cv_control)
fit.rpart13 <- train(share2bins ~ ., data = data_train, method = "rpart",
                     tuneLength = 13, trControl = cv_control)
fit.rpart17 <- train(share2bins ~ ., data = data_train, method = "rpart",
                     tuneLength = 17, trControl = cv_control)
fit.rpart20 <- train(share2bins ~ ., data = data_train, method = "rpart",
                     tuneLength = 20, trControl = cv_control)

# NOTE(review): the summary and prp() plot use fit.rpart5 while the
# evaluation below uses fit.rpart8 -- confirm this mix is intended.
summary(fit.rpart5$finalModel)

# Select the predictors by name instead of a hard-coded column range, so the
# label column is excluded and the code survives preprocessing changes.
test_predictors <- data_test[, names(data_test) != "share2bins", drop = FALSE]
predictions <- predict(fit.rpart8, newdata = test_predictors)
confusionMatrix(predictions, data_test$share2bins)

fancyRpartPlot(fit.rpart8$finalModel)
prp(fit.rpart5$finalModel)
print(fit.rpart8)
print(fit.rpart8$finalModel)
Sys.time()
##################################################
############# Naive Bayes #####################
Sys.time()
fit.nb <- train(share2bins ~ ., data = data_train, method = "nb",
                trControl = trainControl(method = "cv", number = 10))

# Select the predictors by name. The previous hard-coded range 1:58 points at
# columns that no longer exist after the dummy columns were collapsed, which
# makes the subset fail with "undefined columns selected".
test_predictors <- data_test[, names(data_test) != "share2bins", drop = FALSE]
predictions <- predict(fit.nb, newdata = test_predictors)
confusionMatrix(predictions, data_test$share2bins)

print(fit.nb$finalModel)
print(fit.nb)
Sys.time()
##################################################
############# Random Forest #####################
# Random forest with 501 trees and a fixed mtry of 4, evaluated with 10-fold
# cross-validation. allowParallel is a trainControl() option, so it is set
# there rather than being passed through train() to the underlying model.
fit.rf <- train(share2bins ~ ., data = data_train, method = "rf",
                ntree = 501, tuneGrid = data.frame(mtry = 4),
                trControl = trainControl(method = "cv", number = 10,
                                         allowParallel = TRUE))

# Predict with the model fitted above (fit.rf; "fit.rf2" is never defined)
# and select the predictors by name instead of a hard-coded column range.
test_predictors <- data_test[, names(data_test) != "share2bins", drop = FALSE]
predictions <- predict(fit.rf, newdata = test_predictors)
confusionMatrix(predictions, data_test$share2bins)

print(fit.rf)
print(fit.rf$finalModel)
# Namespace-qualified because the script never attaches randomForest itself.
randomForest::importance(fit.rf$finalModel)
randomForest::varImpPlot(fit.rf$finalModel)
##################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment